mirror of https://github.com/allenai/olmocr.git
synced 2025-10-14 09:42:47 +00:00
230 lines · 7.9 KiB · Python
# import json
# import random
# import re
# import argparse
#
#
# def read_file_and_fix_json(file_path):
#     """
#     Read the entire file and use a regex to extract JSON objects.
#
#     The expected format in the file is (possibly spanning multiple lines):
#     {"image": "1", "text": "some text here ..."}
#
#     This function returns a dict mapping pdf names (like "1.pdf") to the full text.
#     """
#     print(f"Reading file: {file_path}")
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             content = file.read()
#
#         document_texts = {}
#         # This regex looks for: {"image": "<number>", "text": "<text>"}
#         # Using DOTALL so that the text portion can include newline characters.
#         document_pattern = r'\{"image":\s*"(\d+)"\s*,\s*"text":\s*"(.*?)"\}'
#         matches = re.findall(document_pattern, content, re.DOTALL)
#
#         for image_num, text in matches:
#             pdf_name = f"{image_num}.pdf"
#             # Replace newlines (and subsequent spaces) with a single space
#             cleaned_text = re.sub(r'\n\s*', ' ', text)
#             document_texts[pdf_name] = cleaned_text
#             print(f"Found text for {pdf_name}")
#
#         return document_texts
#     except Exception as e:
#         print(f"Error reading file: {e}")
#         return {}
#
#
# def get_random_sentence(text):
#     """
#     Extract a random complete sentence from the provided text.
#
#     This function splits the text into sentences using punctuation (".", "!", or "?")
#     followed by whitespace.
#     """
#     text = text.strip()
#     # Split text into sentences. The regex uses a positive lookbehind to keep the punctuation.
#     sentences = re.split(r'(?<=[.!?])\s+', text)
#     # Remove any empty strings
#     sentences = [s for s in sentences if s]
#     if not sentences:
#         return ""
#     return random.choice(sentences)
#
#
# def process_jsonl(input_path, output_path):
#     """
#     Process the malformed JSONL file:
#
#     1. Extract complete JSON objects using a regex.
#     2. For each document, extract one random complete sentence.
#     3. Create a processed JSON object containing the fields:
#        - "pdf": the original PDF filename (e.g. "1.pdf")
#        - "page": set to 1
#        - "id": constructed as "<pdf>_processed01"
#        - "type": "present"
#        - "max_diffs": 2
#        - "text": the randomly chosen sentence
#        - "case_sensitive": true
#        - "first_n": null
#        - "last_n": null
#     4. Write the processed objects as a JSONL file.
#     """
#     documents = read_file_and_fix_json(input_path)
#     if not documents:
#         print("No documents found in the file.")
#         return
#
#     output_lines = []
#     for pdf_name, text in documents.items():
#         random_sentence = get_random_sentence(text)
#         processed_obj = {
#             "pdf": pdf_name,
#             "page": 1,
#             "id": f"{pdf_name}_processed01",
#             "type": "present",
#             "max_diffs": 2,
#             "text": random_sentence,
#             "case_sensitive": True,
#             "first_n": None,
#             "last_n": None
#         }
#         output_lines.append(processed_obj)
#
#     with open(output_path, 'w', encoding='utf-8') as outfile:
#         for obj in output_lines:
#             outfile.write(json.dumps(obj) + "\n")
#
#     print(f"Processed {len(output_lines)} documents. Output written to {output_path}.")
#
#
# def main():
#     parser = argparse.ArgumentParser(
#         description="Process a malformed JSONL file and extract a random sentence from each document."
#     )
#     parser.add_argument("input_file", help="Path to the input JSONL file")
#     parser.add_argument("output_file", help="Path to the output JSONL file")
#     args = parser.parse_args()
#
#     process_jsonl(args.input_file, args.output_file)
#
#
# if __name__ == '__main__':
#     main()

import json
import random
import re
import argparse


def read_file_and_fix_json(file_path):
    """
    Read the entire file and use a regex to extract JSON objects.

    The expected format in the file is (possibly spanning multiple lines):
    {"image": "1", "text": "some text here ..."}

    This function returns a dict mapping pdf names (like "1.pdf") to the full text.
    """
    print(f"Reading file: {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        document_texts = {}
        # This regex looks for: {"image": "<number>", "text": "<text>"}
        # DOTALL allows the text to span across multiple lines.
        document_pattern = r'\{"image":\s*"(\d+)"\s*,\s*"text":\s*"(.*?)"\}'
        matches = re.findall(document_pattern, content, re.DOTALL)

        for image_num, text in matches:
            pdf_name = f"{image_num}.pdf"
            # Replace newlines (and following spaces) with a single space
            cleaned_text = re.sub(r'\n\s*', ' ', text)
            document_texts[pdf_name] = cleaned_text
            print(f"Found text for {pdf_name}")

        return document_texts
    except Exception as e:
        print(f"Error reading file: {e}")
        return {}

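# A minimal sketch of what read_file_and_fix_json recovers; the sample input is
# hypothetical, not taken from a real run. Given a file containing:
#
#   {"image": "1", "text": "First sentence. Second
#   sentence."}{"image": "2", "text": "Only one sentence."}
#
# the function returns:
#
#   {"1.pdf": "First sentence. Second sentence.",
#    "2.pdf": "Only one sentence."}
#
# Caveat: the non-greedy "(.*?)" stops at the first '"}', so a text field that
# itself contains that character pair would be truncated early.
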
def split_into_sentences(text):
    """
    Split text into sentences. A sentence is assumed to end with a period,
    exclamation mark, or question mark, followed by whitespace.
    """
    text = text.strip()
    # Positive lookbehind keeps the closing punctuation attached to each sentence
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # Filter out empty strings
    sentences = [s for s in sentences if s]
    return sentences

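# A quick sanity check on the splitter (hypothetical call, not from the repo's
# tests): split_into_sentences("Dr. Smith arrived. He left!") returns
# ["Dr.", "Smith arrived.", "He left!"]. The naive punctuation split breaks on
# abbreviations, which is tolerable here: the goal is plausible sentence-sized
# snippets, not perfect segmentation.
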
def process_jsonl_for_order(input_path, output_path):
    """
    Process the input JSONL file and generate ordering test objects.

    For each document, randomly select two complete sentences:
      - "before": the one that appears earlier in the document.
      - "after": the one that appears later.

    If only one sentence is present, both fields are set to that sentence.

    The output JSON object follows this format:
    {
        "pdf": "<pdf_name>",
        "page": 1,
        "id": "<pdf_name>_processed02",
        "type": "order",
        "before": "<sentence>",
        "after": "<sentence>",
        "max_diffs": 3
    }
    """
    documents = read_file_and_fix_json(input_path)
    if not documents:
        print("No documents found in the file.")
        return

    output_objects = []
    for pdf_name, text in documents.items():
        sentences = split_into_sentences(text)
        if not sentences:
            continue

        if len(sentences) < 2:
            # Only one sentence: use it for both before and after.
            before = after = sentences[0]
        else:
            # Randomly pick two distinct positions, keeping document order so
            # that "before" genuinely precedes "after" in the text.
            i, j = sorted(random.sample(range(len(sentences)), 2))
            before, after = sentences[i], sentences[j]

        ordering_obj = {
            "pdf": pdf_name,
            "page": 1,
            "id": f"{pdf_name}_processed02",  # e.g., "1.pdf_processed02"
            "type": "order",
            "before": before,
            "after": after,
            "max_diffs": 3
        }
        output_objects.append(ordering_obj)
        print(f"Created ordering test for {pdf_name}")

    # Write the processed JSON objects as JSON lines.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for obj in output_objects:
            outfile.write(json.dumps(obj) + "\n")

    print(f"Processed {len(output_objects)} ordering tests. Output written to {output_path}.")

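# For illustration only: with the hypothetical two-sentence document sketched
# above, one possible output line would be
#
#   {"pdf": "1.pdf", "page": 1, "id": "1.pdf_processed02", "type": "order",
#    "before": "First sentence.", "after": "Second sentence.", "max_diffs": 3}
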
def main():
    parser = argparse.ArgumentParser(
        description="Generate ordering tests from a malformed JSONL file by extracting complete sentences."
    )
    parser.add_argument("input_file", help="Path to the input JSONL file")
    parser.add_argument("output_file", help="Path to the output JSONL file for ordering tests")
    args = parser.parse_args()

    process_jsonl_for_order(args.input_file, args.output_file)


if __name__ == '__main__':
    main()
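# Example invocation (the script and file names here are illustrative):
#   python make_order_tests.py extracted_pages.jsonl order_tests.jsonl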