fixed style and lint

2025-10-13 01:02:26 +00:00 · 2025-04-15 15:14:00 -07:00 · 2025-04-15 15:14:00 -07:00 · c72b8fb47c
commit c72b8fb47c
parent bc89f90216
3 changed files with 30 additions and 29 deletions
--- a/olmocr/bench/miners/processing_old_scans.py
+++ b/olmocr/bench/miners/processing_old_scans.py
@@ -2,35 +2,37 @@ import json
 import random
 import re

+
 def extract_random_segment(text, min_words=7, max_words=15):
    """Extract a random segment of 7-15 words from the text."""
    words = text.split()
    if len(words) <= max_words:
        return text  # Return full text if it's shorter than max_words
-    
+
    max_start = len(words) - min_words
    start = random.randint(0, max_start)
    remaining_words = len(words) - start
    segment_length = random.randint(min_words, min(max_words, remaining_words))
-    segment = words[start:start + segment_length]
-    return ' '.join(segment)
+    segment = words[start : start + segment_length]
+    return " ".join(segment)
+

 def process_jsonl_file_present(input_file, output_file):
    """Process a JSONL file and create multiple random cases for each PDF."""
-    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            if line.strip():  # Skip empty lines
                data = json.loads(line)
                image = data["image"]
                original_text = data["text"]
                num_cases = random.randint(1, 3)
-                
+
                for _ in range(num_cases):
                    processed_num = random.randint(5, 10)
                    processed_id = f"{image}_processed{processed_num:02d}"
                    max_diffs = random.randint(1, 2)
                    text_segment = extract_random_segment(original_text)
-                    
+
                    new_case = {
                        "pdf": f"{image}.pdf",
                        "page": 1,
@@ -40,15 +42,15 @@ def process_jsonl_file_present(input_file, output_file):
                        "text": text_segment,
                        "case_sensitive": True,
                        "first_n": None,
-                        "last_n": None
+                        "last_n": None,
                    }
-                    outfile.write(json.dumps(new_case) + '\n')
+                    outfile.write(json.dumps(new_case) + "\n")


 def extract_ordered_segments(text, min_words=7, max_words=15):
    """Extract two ordered segments from the text."""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+
    if len(sentences) < 2:
        return None, None
    valid_indices = list(range(len(sentences)))
@@ -62,33 +64,34 @@ def extract_ordered_segments(text, min_words=7, max_words=15):

    before_words = before_sentence.split()
    after_words = after_sentence.split()
-    
+
    if len(before_words) > max_words:
        start = random.randint(0, len(before_words) - min_words)
        length = random.randint(min_words, min(max_words, len(before_words) - start))
-        before_segment = ' '.join(before_words[start:start + length])
+        before_segment = " ".join(before_words[start : start + length])
    else:
        before_segment = before_sentence
-        
+
    if len(after_words) > max_words:
        start = random.randint(0, len(after_words) - min_words)
        length = random.randint(min_words, min(max_words, len(after_words) - start))
-        after_segment = ' '.join(after_words[start:start + length])
+        after_segment = " ".join(after_words[start : start + length])
    else:
        after_segment = after_sentence
-    
+
    return before_segment, after_segment

+
 def process_jsonl_file_order(input_file, output_file):
    """Process a JSONL file and create order-type cases."""
-    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            if line.strip():  # Skip empty lines
                data = json.loads(line)
                image = data["image"]
                original_text = data["text"]
                num_cases = random.randint(1, 3)
-                
+
                for _ in range(num_cases):
                    before_text, after_text = extract_ordered_segments(original_text)
                    if not before_text or not after_text:
@@ -96,7 +99,7 @@ def process_jsonl_file_order(input_file, output_file):
                    processed_num = random.randint(11, 16)
                    processed_id = f"{image}_processed{processed_num:02d}"
                    max_diffs = random.randint(1, 3)
-                    
+
                    new_case = {
                        "pdf": f"{image}.pdf",
                        "page": 1,
@@ -106,13 +109,14 @@ def process_jsonl_file_order(input_file, output_file):
                        "after": after_text,
                        "max_diffs": max_diffs,
                        "checked": "verified",
-                        "url": f"https://example.com/document/{image}"
+                        "url": f"https://example.com/document/{image}",
                    }
-                    
-                    outfile.write(json.dumps(new_case) + '\n')
+
+                    outfile.write(json.dumps(new_case) + "\n")
+

 if __name__ == "__main__":
    input_file = "olmoce/bench/sample_data/old_scans.jsonl"
    output_file = "order_cases.jsonl"
    process_jsonl_file_present(input_file, output_file)
-    process_jsonl_file_order(input_file, output_file)
+    process_jsonl_file_order(input_file, output_file)
--- a/olmocr/bench/review_app_latex.py
+++ b/olmocr/bench/review_app_latex.py
@@ -244,7 +244,7 @@ def create_templates_directory():
    """Create templates directory for Flask if it doesn't exist."""
    templates_dir = os.path.join(os.path.dirname(__file__), "templates")
    os.makedirs(templates_dir, exist_ok=True)
-    
+
    # Create the review_latex.html template with MathJax support
    review_html = """
 <!DOCTYPE html>
@@ -607,7 +607,7 @@ def create_templates_directory():
 </body>
 </html>
    """
-    
+
    # Create the all_done_latex.html template
    all_done_html = """
 <!DOCTYPE html>
@@ -663,11 +663,10 @@ def create_templates_directory():
 </body>
 </html>
    """
-    

    with open(os.path.join(templates_dir, "review_latex.html"), "w") as f:
        f.write(review_html)
-        
+
    with open(os.path.join(templates_dir, "all_done_latex.html"), "w") as f:
        f.write(all_done_html)

@@ -690,7 +689,6 @@ def main():
        print(f"Error: Dataset not found: {args.dataset_file}")
        return 1

-
    DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
    DATASET_FILE = args.dataset_file

@@ -715,4 +713,4 @@ def main():


 if __name__ == "__main__":
-    sys.exit(main())
+    sys.exit(main())
--- a/olmocr/bench/runners/run_rolmocr.py
+++ b/olmocr/bench/runners/run_rolmocr.py
@@ -1,4 +1,3 @@
-
 import httpx

 from olmocr.data.renderpdf import render_pdf_to_base64png