"""Generate "order" test cases from a malformed JSONL dump of document texts.

The input file is scanned for records shaped like
``{"image": "<digits>", "text": "<text>"}`` (the text may span several raw
lines).  For every record, two sentences are picked at random and written to
the output file as one JSON line describing an ordering test, with ``before``
always occurring earlier in the document than ``after``.
"""

import argparse
import json
import random
import re
from typing import Dict, List, Tuple

# One record: {"image": "<digits>", "text": "<anything>"}.  DOTALL lets the
# text contain raw line breaks; the lazy quantifier stops at the first `"}`.
_DOCUMENT_RE = re.compile(
    r'\{"image":\s*"(\d+)"\s*,\s*"text":\s*"(.*?)"\}', re.DOTALL
)
# A line break together with any whitespace that follows it.
_LINE_BREAK_RE = re.compile(r'\n\s*')
# Whitespace that follows sentence-ending punctuation; the lookbehind keeps
# the punctuation attached to the sentence it terminates.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')


def read_file_and_fix_json(file_path: str) -> Dict[str, str]:
    """Extract the document texts from a malformed JSONL file.

    The whole file is read and searched with a regular expression rather than
    parsed as JSON, because a record's text may contain raw line breaks.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A dict mapping a PDF name such as ``"1.pdf"`` (built from the record's
        ``image`` number) to its text with every line break, plus the
        whitespace following it, collapsed into a single space.  If the same
        image number occurs more than once, the last record wins.  An empty
        dict is returned when the file cannot be read or decoded; the error is
        printed, not raised.
    """
    print(f"Reading file: {file_path}")
    # Only the read itself can fail; keep the try body limited to it so that
    # programming errors in the processing below are not silently swallowed.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        print(f"Error reading file: {e}")
        return {}

    document_texts: Dict[str, str] = {}
    for image_num, text in _DOCUMENT_RE.findall(content):
        pdf_name = f"{image_num}.pdf"
        document_texts[pdf_name] = _LINE_BREAK_RE.sub(' ', text)
        print(f"Found text for {pdf_name}")
    return document_texts


def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences.

    A sentence is assumed to end with ``.``, ``!`` or ``?`` followed by
    whitespace; the punctuation stays with its sentence.

    Args:
        text: The text to split; surrounding whitespace is ignored.

    Returns:
        The non-empty sentences in document order (an empty list for empty or
        whitespace-only input).
    """
    pieces = _SENTENCE_BOUNDARY_RE.split(text.strip())
    return [piece for piece in pieces if piece]


def _pick_ordered_pair(sentences: List[str]) -> Tuple[str, str]:
    """Pick a random ``(before, after)`` pair from a non-empty sentence list.

    Repeated sentences are reduced to their first occurrence so the two picks
    are textually different whenever possible, and the picks are returned in
    document order.  With fewer than two distinct sentences, the single
    sentence is returned for both positions.
    """
    # dict.fromkeys keeps the first occurrence of each sentence, in order.
    distinct = list(dict.fromkeys(sentences))
    if len(distinct) < 2:
        return distinct[0], distinct[0]
    # Sample positions (not sentences) and sort them: an ordering test is only
    # valid when "before" really precedes "after" in the document.
    earlier, later = sorted(random.sample(range(len(distinct)), 2))
    return distinct[earlier], distinct[later]


def process_jsonl_for_order(input_path: str, output_path: str) -> None:
    """Generate ordering test objects from the input file.

    For each document found in *input_path*, two different sentences are
    chosen at random; the one occurring earlier in the document becomes
    ``before`` and the later one ``after``.  If the document has only one
    (distinct) sentence, both fields are set to it.  Documents without any
    sentence are skipped.

    Each output line is a JSON object of the form::

        {
            "pdf": "<pdf_name>",
            "page": 1,
            "id": "<pdf_name>_processed02",
            "type": "order",
            "before": "<sentence>",
            "after": "<sentence>",
            "max_diffs": 3
        }

    Args:
        input_path: Path of the malformed JSONL input file.
        output_path: Path of the JSONL file to write.  Nothing is written when
            no documents are found in the input.
    """
    documents = read_file_and_fix_json(input_path)
    if not documents:
        print("No documents found in the file.")
        return

    output_objects = []
    for pdf_name, text in documents.items():
        sentences = split_into_sentences(text)
        if not sentences:
            continue

        before, after = _pick_ordered_pair(sentences)
        output_objects.append({
            "pdf": pdf_name,
            "page": 1,
            "id": f"{pdf_name}_processed02",  # e.g., "1.pdf_processed02"
            "type": "order",
            "before": before,
            "after": after,
            "max_diffs": 3,
        })
        print(f"Created ordering test for {pdf_name}")

    # Write the processed JSON objects as JSON lines.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for obj in output_objects:
            outfile.write(json.dumps(obj) + "\n")

    print(f"Processed {len(output_objects)} ordering tests. Output written to {output_path}.")


def main():
    """Parse the command-line arguments and run the ordering-test generation."""
    parser = argparse.ArgumentParser(
        description="Generate ordering tests from a malformed JSONL file by extracting complete sentences."
    )
    parser.add_argument("input_file", help="Path to the input JSONL file")
    parser.add_argument("output_file", help="Path to the output JSONL file for ordering tests")
    args = parser.parse_args()
    process_jsonl_for_order(args.input_file, args.output_file)


if __name__ == '__main__':
    main()