olmocr/randomm.py
2025-04-14 13:25:20 -07:00

150 lines
6.0 KiB
Python

# import json
# import random
# def extract_random_segment(text, min_words=7, max_words=15):
# """Extract a random segment of 7-15 words from the text."""
# words = text.split()
# if len(words) <= max_words:
# return text # Return full text if it's shorter than max_words
# # Choose a random starting point
# max_start = len(words) - min_words
# start = random.randint(0, max_start)
# # Choose a random length between min_words and max_words
# # or the remaining words if less than max_words
# remaining_words = len(words) - start
# segment_length = random.randint(min_words, min(max_words, remaining_words))
# # Extract the segment
# segment = words[start:start + segment_length]
# return ' '.join(segment)
# def process_jsonl_file(input_file, output_file):
# """Process a JSONL file and create multiple random cases for each PDF."""
# with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
# for line in infile:
# if line.strip(): # Skip empty lines
# data = json.loads(line)
# image = data["image"]
# original_text = data["text"]
# # Generate between 1-3 random cases for each PDF
# num_cases = random.randint(1, 3)
# for _ in range(num_cases):
# # Create a new JSON object with random values
# processed_num = random.randint(5, 10)
# processed_id = f"{image}_processed{processed_num:02d}"
# max_diffs = random.randint(1, 2)
# text_segment = extract_random_segment(original_text)
# new_case = {
# "pdf": f"{image}.pdf",
# "page": 1,
# "id": processed_id,
# "type": "present",
# "max_diffs": max_diffs,
# "text": text_segment,
# "case_sensitive": True,
# "first_n": None,
# "last_n": None
# }
# outfile.write(json.dumps(new_case) + '\n')
# if __name__ == "__main__":
# # Change these filenames to match your actual file paths
# input_file = "abc.jsonl"
# output_file = "output.jsonl"
# process_jsonl_file(input_file, output_file)
import json
import random
import re
def _trim_to_segment(sentence, min_words, max_words):
    """Return *sentence* as-is, or a random contiguous run of its words if it is too long."""
    words = sentence.split()
    if len(words) <= max_words:
        return sentence
    # Pick a start that leaves at least ``min_words`` words available, then a
    # length that stays inside both the sentence and the ``max_words`` cap.
    start = random.randint(0, len(words) - min_words)
    length = random.randint(min_words, min(max_words, len(words) - start))
    return ' '.join(words[start:start + length])


def extract_ordered_segments(text, min_words=7, max_words=15):
    """Extract two text segments such that the first precedes the second in *text*.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. Two distinct sentences are chosen at random (they may be
    adjacent), the earlier one always being returned first. A chosen sentence
    with more than ``max_words`` words is reduced to a random contiguous run
    of ``min_words``..``max_words`` words; shorter sentences are returned
    verbatim.

    Args:
        text: Source text to sample from. ``None`` or empty is accepted.
        min_words: Minimum number of words kept when a sentence is trimmed.
        max_words: Sentences longer than this many words are trimmed, and a
            trimmed segment never exceeds this many words.

    Returns:
        A ``(before_segment, after_segment)`` tuple, or ``(None, None)`` when
        the text does not contain at least two non-empty sentences.

    Raises:
        ValueError: If ``min_words`` exceeds ``max_words`` and a chosen
            sentence needs trimming (``random.randint`` gets an empty range).
    """
    if not text:
        return None, None

    # Text ending in punctuation followed by whitespace (e.g. a final newline)
    # makes re.split emit an empty trailing piece; drop such pieces so they
    # can never be selected as a "sentence".
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if len(sentences) < 2:
        # Not enough sentences to establish an ordering.
        return None, None

    if len(sentences) == 2:
        before_idx, after_idx = 0, 1  # Only one possible ordered pair.
    else:
        # after_idx is drawn strictly after before_idx to guarantee ordering.
        before_idx = random.randint(0, len(sentences) - 2)
        after_idx = random.randint(before_idx + 1, len(sentences) - 1)

    before_segment = _trim_to_segment(sentences[before_idx], min_words, max_words)
    after_segment = _trim_to_segment(sentences[after_idx], min_words, max_words)
    return before_segment, after_segment
def process_jsonl_file(input_file, output_file):
    """Read a JSONL file and write "order"-type test cases to another JSONL file.

    Each non-blank input line must be a JSON object with an ``"image"`` key
    (used to build the PDF name, case id and URL) and a ``"text"`` key (the
    text segments are sampled from). For every input record, 1 to 3 cases are
    attempted; an attempt is skipped when two ordered segments cannot be
    extracted from the text.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).

    Raises:
        KeyError: If a record lacks ``"image"`` or ``"text"``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so reading does not depend on the platform's locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():  # Skip empty lines
                continue
            data = json.loads(line)
            image = data["image"]
            original_text = data["text"]

            # Generate between 1-3 random cases for each PDF.
            num_cases = random.randint(1, 3)
            # Sample the id suffixes without replacement so two cases for the
            # same PDF can never end up with the same "id".
            processed_nums = random.sample(range(11, 17), num_cases)
            for processed_num in processed_nums:
                before_text, after_text = extract_ordered_segments(original_text)
                # If we couldn't extract valid segments, skip this case.
                if not before_text or not after_text:
                    continue

                processed_id = f"{image}_processed{processed_num:02d}"
                max_diffs = random.randint(1, 3)
                new_case = {
                    "pdf": f"{image}.pdf",
                    "page": 1,
                    "id": processed_id,
                    "type": "order",
                    "before": before_text,
                    "after": after_text,
                    "max_diffs": max_diffs,
                    "checked": "verified",
                    "url": f"https://example.com/document/{image}",
                }
                outfile.write(json.dumps(new_case) + '\n')
if __name__ == "__main__":
    # Script entry point: read records from "abc.jsonl" in the current working
    # directory and write the generated order cases to "order_cases.jsonl".
    process_jsonl_file(input_file="abc.jsonl", output_file="order_cases.jsonl")