mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-24 22:49:00 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			58 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			58 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| import os
 | |
| import sys
 | |
| 
 | |
| 
 | |
| # This is a simple script to convert JSONL files to Markdown format.
 | |
| # It reads each line of the JSONL file, extracts the 'text' field,
 | |
| # and saves it as a Markdown file with the line number as the filename.
 | |
| # The script also handles potential JSON decoding errors and prints relevant messages.
 | |
def jsonl_to_markdown(input_file, output_dir):
    """
    Reads a JSONL file, extracts the 'text' field from each line, and saves it as a Markdown file.

    Each non-blank line is parsed as one JSON object and written to
    ``<output_dir>/line_<N>.md``, where N is the 1-based line number in the
    input file (blank lines still count towards the numbering). A missing or
    null 'text' field produces an empty Markdown file. Lines that fail to
    parse or to be written are reported on stdout and skipped; processing
    continues with the next line.

    Args:
        input_file (str): Path to the input JSONL file.
        output_dir (str): Directory to save the Markdown files. Created
            (including parents) if it does not exist.

    Raises:
        OSError: If the output directory cannot be created or the input
            file cannot be opened.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, "r", encoding="utf-8") as file:
        for i, line in enumerate(file):
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; skip them instead of reporting a decode error.
            if not line.strip():
                continue

            try:
                # Parse the JSON line
                data = json.loads(line)
                text_content = data.get("text", "")
                # A JSON null would make write() raise after the output file
                # has already been created; treat it like a missing field.
                if text_content is None:
                    text_content = ""

                # Save to a Markdown file
                output_file = os.path.join(output_dir, f"line_{i + 1}.md")
                with open(output_file, "w", encoding="utf-8") as md_file:
                    md_file.write(text_content)

                print(f"Extracted and saved line {i + 1} to {output_file}")
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {i + 1}: {e}")
            except Exception as e:
                # Deliberate best-effort: one bad record must not abort the
                # whole conversion, so report it and move on.
                print(f"Unexpected error on line {i + 1}: {e}")
 | |
| 
 | |
| 
 | |
| # Example usage
 | |
| # input_jsonl_file = "/path/to/test.jsonl"  # Replace with the actual path to your JSONL file
 | |
| # output_directory = "/path/to/output_markdown"  # Replace with the desired output directory
 | |
| # jsonl_to_markdown(input_jsonl_file, output_directory)
 | |
| 
 | |
| # This is the main entrypoint to use the script from the command line.
 | |
| # It takes two arguments: the input JSONL file and the output directory.
 | |
| # The script will create the output directory if it does not exist.
 | |
if __name__ == "__main__":
    # Exactly two positional arguments are required after the script name.
    if len(sys.argv) != 3:
        print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>")
        sys.exit(1)

    # Unpack the two arguments in one step and run the conversion.
    input_file, output_dir = sys.argv[1:]
    jsonl_to_markdown(input_file, output_dir)
 | 
