Mirror of https://github.com/allenai/olmocr.git (synced 2025-07-31 21:13:57 +00:00)

Commit c72b8fb47c (parent bc89f90216): fixed style and lint
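Every change in the hunks below is cosmetic: string quotes are normalized to double quotes, multi-line literals gain trailing commas, blank lines around top-level definitions are adjusted, and slices whose bounds are expressions get spaces around the colon. The changes are consistent with a black-style formatter, though the exact tooling is not recorded in this capture. The slice rule is the least familiar one; a minimal illustration of the convention (variable names made up):

    words = ["a", "b", "c", "d", "e"]
    start, length = 1, 3
    chunk = words[start : start + length]  # compound bound, so spaces around ":"
    head = words[:2]                       # simple bound, no spaces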
@@ -2,35 +2,37 @@ import json
 import random
 import re
 
+
 def extract_random_segment(text, min_words=7, max_words=15):
     """Extract a random segment of 7-15 words from the text."""
     words = text.split()
     if len(words) <= max_words:
         return text  # Return full text if it's shorter than max_words
 
     max_start = len(words) - min_words
     start = random.randint(0, max_start)
     remaining_words = len(words) - start
     segment_length = random.randint(min_words, min(max_words, remaining_words))
-    segment = words[start:start + segment_length]
-    return ' '.join(segment)
+    segment = words[start : start + segment_length]
+    return " ".join(segment)
 
+
 def process_jsonl_file_present(input_file, output_file):
     """Process a JSONL file and create multiple random cases for each PDF."""
-    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
         for line in infile:
             if line.strip():  # Skip empty lines
                 data = json.loads(line)
                 image = data["image"]
                 original_text = data["text"]
                 num_cases = random.randint(1, 3)
 
                 for _ in range(num_cases):
                     processed_num = random.randint(5, 10)
                     processed_id = f"{image}_processed{processed_num:02d}"
                     max_diffs = random.randint(1, 2)
                     text_segment = extract_random_segment(original_text)
 
                     new_case = {
                         "pdf": f"{image}.pdf",
                         "page": 1,
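As a quick illustration of extract_random_segment as it reads after this hunk (hypothetical input; the window is chosen at random, so only its length and contiguity are guaranteed):

    # For any input longer than max_words, the result is a contiguous 7-15 word window.
    sample = " ".join(f"word{i}" for i in range(40))
    segment = extract_random_segment(sample)
    assert 7 <= len(segment.split()) <= 15
    assert segment in sample  # the slice is contiguous, so it appears verbatim
    # Inputs of max_words words or fewer are returned unchanged.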
@@ -40,15 +42,15 @@ def process_jsonl_file_present(input_file, output_file):
                         "text": text_segment,
                         "case_sensitive": True,
                         "first_n": None,
-                        "last_n": None
+                        "last_n": None,
                     }
-                    outfile.write(json.dumps(new_case) + '\n')
+                    outfile.write(json.dumps(new_case) + "\n")
 
 
 def extract_ordered_segments(text, min_words=7, max_words=15):
     """Extract two ordered segments from the text."""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
+    sentences = re.split(r"(?<=[.!?])\s+", text)
 
     if len(sentences) < 2:
         return None, None
     valid_indices = list(range(len(sentences)))
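The lookbehind split introduced above keeps the closing punctuation attached to each sentence; a small illustration with a made-up string:

    # Splits on whitespace that immediately follows '.', '!' or '?'.
    re.split(r"(?<=[.!?])\s+", "One sentence. Another one! A third?")
    # -> ['One sentence.', 'Another one!', 'A third?']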
@@ -62,33 +64,34 @@ def extract_ordered_segments(text, min_words=7, max_words=15):
 
     before_words = before_sentence.split()
     after_words = after_sentence.split()
 
     if len(before_words) > max_words:
         start = random.randint(0, len(before_words) - min_words)
         length = random.randint(min_words, min(max_words, len(before_words) - start))
-        before_segment = ' '.join(before_words[start:start + length])
+        before_segment = " ".join(before_words[start : start + length])
     else:
         before_segment = before_sentence
 
     if len(after_words) > max_words:
         start = random.randint(0, len(after_words) - min_words)
         length = random.randint(min_words, min(max_words, len(after_words) - start))
-        after_segment = ' '.join(after_words[start:start + length])
+        after_segment = " ".join(after_words[start : start + length])
     else:
         after_segment = after_sentence
 
     return before_segment, after_segment
 
+
 def process_jsonl_file_order(input_file, output_file):
     """Process a JSONL file and create order-type cases."""
-    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
         for line in infile:
             if line.strip():  # Skip empty lines
                 data = json.loads(line)
                 image = data["image"]
                 original_text = data["text"]
                 num_cases = random.randint(1, 3)
 
                 for _ in range(num_cases):
                     before_text, after_text = extract_ordered_segments(original_text)
                     if not before_text or not after_text:
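The guard above means a single-sentence page can never yield an order case: extract_ordered_segments returns (None, None), and the caller checks both values before building new_case. A minimal check (hypothetical input):

    # One sentence -> re.split produces a single element -> early return.
    assert extract_ordered_segments("Only one sentence here.") == (None, None)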
@@ -96,7 +99,7 @@ def process_jsonl_file_order(input_file, output_file):
                     processed_num = random.randint(11, 16)
                     processed_id = f"{image}_processed{processed_num:02d}"
                     max_diffs = random.randint(1, 3)
 
                     new_case = {
                         "pdf": f"{image}.pdf",
                         "page": 1,
@@ -106,13 +109,14 @@ def process_jsonl_file_order(input_file, output_file):
                         "after": after_text,
                         "max_diffs": max_diffs,
                         "checked": "verified",
-                        "url": f"https://example.com/document/{image}"
+                        "url": f"https://example.com/document/{image}",
                     }
 
-                    outfile.write(json.dumps(new_case) + '\n')
+                    outfile.write(json.dumps(new_case) + "\n")
 
+
 if __name__ == "__main__":
     input_file = "olmoce/bench/sample_data/old_scans.jsonl"
     output_file = "order_cases.jsonl"
     process_jsonl_file_present(input_file, output_file)
     process_jsonl_file_order(input_file, output_file)
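Restricting to the keys visible in this diff (the collapsed lines hide a few more), a present-type record written by process_jsonl_file_present looks roughly like the sketch below; values are illustrative because segment choice and counts are random. The order-type records built in process_jsonl_file_order likewise include "pdf" and "page", plus the "after", "max_diffs", "checked" and "url" keys visible in the hunk above.

    # Sketch of one output line, limited to keys that appear in these hunks.
    example_present_case = {
        "pdf": "some_image.pdf",  # f"{image}.pdf"
        "page": 1,
        "text": "a seven to fifteen word window from the page text",
        "case_sensitive": True,
        "first_n": None,
        "last_n": None,
    }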
@@ -244,7 +244,7 @@ def create_templates_directory():
     """Create templates directory for Flask if it doesn't exist."""
     templates_dir = os.path.join(os.path.dirname(__file__), "templates")
     os.makedirs(templates_dir, exist_ok=True)
 
     # Create the review_latex.html template with MathJax support
     review_html = """
 <!DOCTYPE html>
@@ -607,7 +607,7 @@ def create_templates_directory():
 </body>
 </html>
 """
 
     # Create the all_done_latex.html template
     all_done_html = """
 <!DOCTYPE html>
@@ -663,11 +663,10 @@ def create_templates_directory():
 </body>
 </html>
 """
 
-
     with open(os.path.join(templates_dir, "review_latex.html"), "w") as f:
         f.write(review_html)
 
     with open(os.path.join(templates_dir, "all_done_latex.html"), "w") as f:
         f.write(all_done_html)
 
@@ -690,7 +689,6 @@ def main():
         print(f"Error: Dataset not found: {args.dataset_file}")
         return 1
 
-
     DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
     DATASET_FILE = args.dataset_file
 
@@ -715,4 +713,4 @@ def main():
 
 
 if __name__ == "__main__":
     sys.exit(main())
@@ -1,4 +1,3 @@
-
 import httpx
 
 from olmocr.data.renderpdf import render_pdf_to_base64png