mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-13 01:02:26 +00:00
150 lines
6.0 KiB
Python
150 lines
6.0 KiB
Python
# import json
|
|
# import random
|
|
|
|
# def extract_random_segment(text, min_words=7, max_words=15):
|
|
# """Extract a random segment of 7-15 words from the text."""
|
|
# words = text.split()
|
|
# if len(words) <= max_words:
|
|
# return text # Return full text if it's shorter than max_words
|
|
|
|
# # Choose a random starting point
|
|
# max_start = len(words) - min_words
|
|
# start = random.randint(0, max_start)
|
|
|
|
# # Choose a random length between min_words and max_words
|
|
# # or the remaining words if less than max_words
|
|
# remaining_words = len(words) - start
|
|
# segment_length = random.randint(min_words, min(max_words, remaining_words))
|
|
|
|
# # Extract the segment
|
|
# segment = words[start:start + segment_length]
|
|
# return ' '.join(segment)
|
|
|
|
# def process_jsonl_file(input_file, output_file):
|
|
# """Process a JSONL file and create multiple random cases for each PDF."""
|
|
# with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
|
|
# for line in infile:
|
|
# if line.strip(): # Skip empty lines
|
|
# data = json.loads(line)
|
|
# image = data["image"]
|
|
# original_text = data["text"]
|
|
|
|
# # Generate between 1 and 3 random cases for each PDF
|
|
# num_cases = random.randint(1, 3)
|
|
|
|
# for _ in range(num_cases):
|
|
# # Create a new JSON object with random values
|
|
# processed_num = random.randint(5, 10)
|
|
# processed_id = f"{image}_processed{processed_num:02d}"
|
|
# max_diffs = random.randint(1, 2)
|
|
# text_segment = extract_random_segment(original_text)
|
|
|
|
# new_case = {
|
|
# "pdf": f"{image}.pdf",
|
|
# "page": 1,
|
|
# "id": processed_id,
|
|
# "type": "present",
|
|
# "max_diffs": max_diffs,
|
|
# "text": text_segment,
|
|
# "case_sensitive": True,
|
|
# "first_n": None,
|
|
# "last_n": None
|
|
# }
|
|
|
|
# outfile.write(json.dumps(new_case) + '\n')
|
|
|
|
# if __name__ == "__main__":
|
|
# # Change these filenames to match your actual file paths
|
|
# input_file = "abc.jsonl"
|
|
# output_file = "output.jsonl"
|
|
# process_jsonl_file(input_file, output_file)
|
|
|
|
import json
|
|
import random
|
|
import re
|
|
|
|
def extract_ordered_segments(text, min_words=7, max_words=15):
    """Pick two snippets from ``text`` where the first occurs before the second.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. Two distinct sentences are chosen at random, the earlier one
    first (they may be adjacent). A chosen sentence with more than
    ``max_words`` words is replaced by a random contiguous run of
    ``min_words``..``max_words`` of its words; shorter sentences are returned
    unchanged.

    Args:
        text: Source text to sample from. ``None`` or an empty string is
            treated as having no sentences.
        min_words: Minimum number of words in a shortened segment.
        max_words: Sentence length (in words) above which a sentence is
            shortened, and the maximum length of the shortened segment.

    Returns:
        A ``(before, after)`` tuple of strings, or ``(None, None)`` when the
        text contains fewer than two non-empty sentences.
    """
    if not text:
        return None, None

    # Text that ends in whitespace after its last punctuation mark makes
    # re.split emit a trailing "" piece. Drop blank pieces so they are neither
    # counted as sentences nor returned as a segment.
    sentences = [
        piece for piece in re.split(r'(?<=[.!?])\s+', text) if piece.strip()
    ]

    if len(sentences) < 2:
        # Not enough sentences to express an ordering.
        return None, None

    if len(sentences) == 2:
        before_idx, after_idx = 0, 1  # Only one possible ordered pair.
    else:
        # Draw the earlier index first, then any later index, so that
        # after_idx > before_idx always holds.
        before_idx = random.randint(0, len(sentences) - 2)
        after_idx = random.randint(before_idx + 1, len(sentences) - 1)

    before_sentence = sentences[before_idx]
    after_sentence = sentences[after_idx]

    before_words = before_sentence.split()
    after_words = after_sentence.split()

    if len(before_words) > max_words:
        before_segment = _random_word_window(before_words, min_words, max_words)
    else:
        before_segment = before_sentence

    if len(after_words) > max_words:
        after_segment = _random_word_window(after_words, min_words, max_words)
    else:
        after_segment = after_sentence

    return before_segment, after_segment


def _random_word_window(words, min_words, max_words):
    """Return a random contiguous run of words joined by single spaces.

    The run holds between ``min_words`` and ``max_words`` words. Callers must
    ensure ``len(words) > max_words >= min_words``.
    """
    # The start is capped so that at least min_words words remain after it.
    start = random.randint(0, len(words) - min_words)
    length = random.randint(min_words, min(max_words, len(words) - start))
    return ' '.join(words[start:start + length])
|
|
|
|
def process_jsonl_file(input_file, output_file):
    """Read page records from a JSONL file and write "order" cases as JSONL.

    Every non-blank input line is parsed as a JSON object that must have an
    ``"image"`` key (used as the PDF base name, id prefix and URL suffix) and
    a ``"text"`` key (the text to sample from). For each record, 1 to 3 cases
    are attempted; an attempt is skipped when two ordered segments cannot be
    extracted from the text. Each emitted case is written as one JSON object
    per line.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        KeyError: If a record lacks the ``"image"`` or ``"text"`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so reading and writing do not depend on the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue  # Skip empty lines

            data = json.loads(line)
            image = data["image"]
            original_text = data["text"]

            # Attempt between 1 and 3 cases for each record.
            num_cases = random.randint(1, 3)

            # Draw distinct suffixes from 11..16 so that two cases generated
            # for the same image never end up with the same "id".
            for processed_num in random.sample(range(11, 17), num_cases):
                before_text, after_text = extract_ordered_segments(original_text)

                # If we couldn't extract valid segments, skip this case.
                if not before_text or not after_text:
                    continue

                processed_id = f"{image}_processed{processed_num:02d}"
                max_diffs = random.randint(1, 3)

                new_case = {
                    "pdf": f"{image}.pdf",
                    "page": 1,
                    "id": processed_id,
                    "type": "order",
                    "before": before_text,
                    "after": after_text,
                    "max_diffs": max_diffs,
                    "checked": "verified",
                    "url": f"https://example.com/document/{image}",
                }

                outfile.write(json.dumps(new_case) + '\n')
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the source and destination paths are hard-coded
    # here; edit them to point at the real input and output files.
    input_file, output_file = "abc.jsonl", "order_cases.jsonl"
    process_jsonl_file(input_file, output_file)