fixed style and lint

This commit is contained in:
aman-17 2025-04-15 15:14:00 -07:00
parent bc89f90216
commit c72b8fb47c
3 changed files with 30 additions and 29 deletions

View File

@ -2,35 +2,37 @@ import json
import random import random
import re import re
def extract_random_segment(text, min_words=7, max_words=15): def extract_random_segment(text, min_words=7, max_words=15):
"""Extract a random segment of 7-15 words from the text.""" """Extract a random segment of 7-15 words from the text."""
words = text.split() words = text.split()
if len(words) <= max_words: if len(words) <= max_words:
return text # Return full text if it's shorter than max_words return text # Return full text if it's shorter than max_words
max_start = len(words) - min_words max_start = len(words) - min_words
start = random.randint(0, max_start) start = random.randint(0, max_start)
remaining_words = len(words) - start remaining_words = len(words) - start
segment_length = random.randint(min_words, min(max_words, remaining_words)) segment_length = random.randint(min_words, min(max_words, remaining_words))
segment = words[start:start + segment_length] segment = words[start : start + segment_length]
return ' '.join(segment) return " ".join(segment)
def process_jsonl_file_present(input_file, output_file): def process_jsonl_file_present(input_file, output_file):
"""Process a JSONL file and create multiple random cases for each PDF.""" """Process a JSONL file and create multiple random cases for each PDF."""
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile: with open(input_file, "r") as infile, open(output_file, "w") as outfile:
for line in infile: for line in infile:
if line.strip(): # Skip empty lines if line.strip(): # Skip empty lines
data = json.loads(line) data = json.loads(line)
image = data["image"] image = data["image"]
original_text = data["text"] original_text = data["text"]
num_cases = random.randint(1, 3) num_cases = random.randint(1, 3)
for _ in range(num_cases): for _ in range(num_cases):
processed_num = random.randint(5, 10) processed_num = random.randint(5, 10)
processed_id = f"{image}_processed{processed_num:02d}" processed_id = f"{image}_processed{processed_num:02d}"
max_diffs = random.randint(1, 2) max_diffs = random.randint(1, 2)
text_segment = extract_random_segment(original_text) text_segment = extract_random_segment(original_text)
new_case = { new_case = {
"pdf": f"{image}.pdf", "pdf": f"{image}.pdf",
"page": 1, "page": 1,
@ -40,15 +42,15 @@ def process_jsonl_file_present(input_file, output_file):
"text": text_segment, "text": text_segment,
"case_sensitive": True, "case_sensitive": True,
"first_n": None, "first_n": None,
"last_n": None "last_n": None,
} }
outfile.write(json.dumps(new_case) + '\n') outfile.write(json.dumps(new_case) + "\n")
def extract_ordered_segments(text, min_words=7, max_words=15): def extract_ordered_segments(text, min_words=7, max_words=15):
"""Extract two ordered segments from the text.""" """Extract two ordered segments from the text."""
sentences = re.split(r'(?<=[.!?])\s+', text) sentences = re.split(r"(?<=[.!?])\s+", text)
if len(sentences) < 2: if len(sentences) < 2:
return None, None return None, None
valid_indices = list(range(len(sentences))) valid_indices = list(range(len(sentences)))
@ -62,33 +64,34 @@ def extract_ordered_segments(text, min_words=7, max_words=15):
before_words = before_sentence.split() before_words = before_sentence.split()
after_words = after_sentence.split() after_words = after_sentence.split()
if len(before_words) > max_words: if len(before_words) > max_words:
start = random.randint(0, len(before_words) - min_words) start = random.randint(0, len(before_words) - min_words)
length = random.randint(min_words, min(max_words, len(before_words) - start)) length = random.randint(min_words, min(max_words, len(before_words) - start))
before_segment = ' '.join(before_words[start:start + length]) before_segment = " ".join(before_words[start : start + length])
else: else:
before_segment = before_sentence before_segment = before_sentence
if len(after_words) > max_words: if len(after_words) > max_words:
start = random.randint(0, len(after_words) - min_words) start = random.randint(0, len(after_words) - min_words)
length = random.randint(min_words, min(max_words, len(after_words) - start)) length = random.randint(min_words, min(max_words, len(after_words) - start))
after_segment = ' '.join(after_words[start:start + length]) after_segment = " ".join(after_words[start : start + length])
else: else:
after_segment = after_sentence after_segment = after_sentence
return before_segment, after_segment return before_segment, after_segment
def process_jsonl_file_order(input_file, output_file): def process_jsonl_file_order(input_file, output_file):
"""Process a JSONL file and create order-type cases.""" """Process a JSONL file and create order-type cases."""
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile: with open(input_file, "r") as infile, open(output_file, "w") as outfile:
for line in infile: for line in infile:
if line.strip(): # Skip empty lines if line.strip(): # Skip empty lines
data = json.loads(line) data = json.loads(line)
image = data["image"] image = data["image"]
original_text = data["text"] original_text = data["text"]
num_cases = random.randint(1, 3) num_cases = random.randint(1, 3)
for _ in range(num_cases): for _ in range(num_cases):
before_text, after_text = extract_ordered_segments(original_text) before_text, after_text = extract_ordered_segments(original_text)
if not before_text or not after_text: if not before_text or not after_text:
@ -96,7 +99,7 @@ def process_jsonl_file_order(input_file, output_file):
processed_num = random.randint(11, 16) processed_num = random.randint(11, 16)
processed_id = f"{image}_processed{processed_num:02d}" processed_id = f"{image}_processed{processed_num:02d}"
max_diffs = random.randint(1, 3) max_diffs = random.randint(1, 3)
new_case = { new_case = {
"pdf": f"{image}.pdf", "pdf": f"{image}.pdf",
"page": 1, "page": 1,
@ -106,13 +109,14 @@ def process_jsonl_file_order(input_file, output_file):
"after": after_text, "after": after_text,
"max_diffs": max_diffs, "max_diffs": max_diffs,
"checked": "verified", "checked": "verified",
"url": f"https://example.com/document/{image}" "url": f"https://example.com/document/{image}",
} }
outfile.write(json.dumps(new_case) + '\n') outfile.write(json.dumps(new_case) + "\n")
if __name__ == "__main__": if __name__ == "__main__":
input_file = "olmoce/bench/sample_data/old_scans.jsonl" input_file = "olmoce/bench/sample_data/old_scans.jsonl"
output_file = "order_cases.jsonl" output_file = "order_cases.jsonl"
process_jsonl_file_present(input_file, output_file) process_jsonl_file_present(input_file, output_file)
process_jsonl_file_order(input_file, output_file) process_jsonl_file_order(input_file, output_file)

View File

@ -244,7 +244,7 @@ def create_templates_directory():
"""Create templates directory for Flask if it doesn't exist.""" """Create templates directory for Flask if it doesn't exist."""
templates_dir = os.path.join(os.path.dirname(__file__), "templates") templates_dir = os.path.join(os.path.dirname(__file__), "templates")
os.makedirs(templates_dir, exist_ok=True) os.makedirs(templates_dir, exist_ok=True)
# Create the review_latex.html template with MathJax support # Create the review_latex.html template with MathJax support
review_html = """ review_html = """
<!DOCTYPE html> <!DOCTYPE html>
@ -607,7 +607,7 @@ def create_templates_directory():
</body> </body>
</html> </html>
""" """
# Create the all_done_latex.html template # Create the all_done_latex.html template
all_done_html = """ all_done_html = """
<!DOCTYPE html> <!DOCTYPE html>
@ -663,11 +663,10 @@ def create_templates_directory():
</body> </body>
</html> </html>
""" """
with open(os.path.join(templates_dir, "review_latex.html"), "w") as f: with open(os.path.join(templates_dir, "review_latex.html"), "w") as f:
f.write(review_html) f.write(review_html)
with open(os.path.join(templates_dir, "all_done_latex.html"), "w") as f: with open(os.path.join(templates_dir, "all_done_latex.html"), "w") as f:
f.write(all_done_html) f.write(all_done_html)
@ -690,7 +689,6 @@ def main():
print(f"Error: Dataset not found: {args.dataset_file}") print(f"Error: Dataset not found: {args.dataset_file}")
return 1 return 1
DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file)) DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
DATASET_FILE = args.dataset_file DATASET_FILE = args.dataset_file
@ -715,4 +713,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
sys.exit(main()) sys.exit(main())

View File

@ -1,4 +1,3 @@
import httpx import httpx
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png