mirror of
https://github.com/allenai/olmocr.git
synced 2025-07-31 21:13:57 +00:00
fixed style and lint
This commit is contained in:
parent
bc89f90216
commit
c72b8fb47c
@ -2,6 +2,7 @@ import json
|
||||
import random
|
||||
import re
|
||||
|
||||
|
||||
def extract_random_segment(text, min_words=7, max_words=15):
|
||||
"""Extract a random segment of 7-15 words from the text."""
|
||||
words = text.split()
|
||||
@ -13,11 +14,12 @@ def extract_random_segment(text, min_words=7, max_words=15):
|
||||
remaining_words = len(words) - start
|
||||
segment_length = random.randint(min_words, min(max_words, remaining_words))
|
||||
segment = words[start : start + segment_length]
|
||||
return ' '.join(segment)
|
||||
return " ".join(segment)
|
||||
|
||||
|
||||
def process_jsonl_file_present(input_file, output_file):
|
||||
"""Process a JSONL file and create multiple random cases for each PDF."""
|
||||
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
|
||||
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
|
||||
for line in infile:
|
||||
if line.strip(): # Skip empty lines
|
||||
data = json.loads(line)
|
||||
@ -40,14 +42,14 @@ def process_jsonl_file_present(input_file, output_file):
|
||||
"text": text_segment,
|
||||
"case_sensitive": True,
|
||||
"first_n": None,
|
||||
"last_n": None
|
||||
"last_n": None,
|
||||
}
|
||||
outfile.write(json.dumps(new_case) + '\n')
|
||||
outfile.write(json.dumps(new_case) + "\n")
|
||||
|
||||
|
||||
def extract_ordered_segments(text, min_words=7, max_words=15):
|
||||
"""Extract two ordered segments from the text."""
|
||||
sentences = re.split(r'(?<=[.!?])\s+', text)
|
||||
sentences = re.split(r"(?<=[.!?])\s+", text)
|
||||
|
||||
if len(sentences) < 2:
|
||||
return None, None
|
||||
@ -66,22 +68,23 @@ def extract_ordered_segments(text, min_words=7, max_words=15):
|
||||
if len(before_words) > max_words:
|
||||
start = random.randint(0, len(before_words) - min_words)
|
||||
length = random.randint(min_words, min(max_words, len(before_words) - start))
|
||||
before_segment = ' '.join(before_words[start:start + length])
|
||||
before_segment = " ".join(before_words[start : start + length])
|
||||
else:
|
||||
before_segment = before_sentence
|
||||
|
||||
if len(after_words) > max_words:
|
||||
start = random.randint(0, len(after_words) - min_words)
|
||||
length = random.randint(min_words, min(max_words, len(after_words) - start))
|
||||
after_segment = ' '.join(after_words[start:start + length])
|
||||
after_segment = " ".join(after_words[start : start + length])
|
||||
else:
|
||||
after_segment = after_sentence
|
||||
|
||||
return before_segment, after_segment
|
||||
|
||||
|
||||
def process_jsonl_file_order(input_file, output_file):
|
||||
"""Process a JSONL file and create order-type cases."""
|
||||
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
|
||||
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
|
||||
for line in infile:
|
||||
if line.strip(): # Skip empty lines
|
||||
data = json.loads(line)
|
||||
@ -106,10 +109,11 @@ def process_jsonl_file_order(input_file, output_file):
|
||||
"after": after_text,
|
||||
"max_diffs": max_diffs,
|
||||
"checked": "verified",
|
||||
"url": f"https://example.com/document/{image}"
|
||||
"url": f"https://example.com/document/{image}",
|
||||
}
|
||||
|
||||
outfile.write(json.dumps(new_case) + '\n')
|
||||
outfile.write(json.dumps(new_case) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
input_file = "olmoce/bench/sample_data/old_scans.jsonl"
|
||||
|
@ -664,7 +664,6 @@ def create_templates_directory():
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
with open(os.path.join(templates_dir, "review_latex.html"), "w") as f:
|
||||
f.write(review_html)
|
||||
|
||||
@ -690,7 +689,6 @@ def main():
|
||||
print(f"Error: Dataset not found: {args.dataset_file}")
|
||||
return 1
|
||||
|
||||
|
||||
DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
|
||||
DATASET_FILE = args.dataset_file
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
import httpx
|
||||
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
|
Loading…
x
Reference in New Issue
Block a user