| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  | import json | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import sys | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  | # This is a simple script to convert JSONL files to Markdown format. | 
					
						
							|  |  |  | # It reads each line of the JSONL file, extracts the 'text' field, | 
					
						
							|  |  |  | # and saves it as a Markdown file with the line number as the filename. | 
					
						
							|  |  |  | # The script also handles potential JSON decoding errors and prints relevant messages. | 
					
						
							|  |  |  | def jsonl_to_markdown(input_file, output_dir): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Reads a JSONL file, extracts the 'text' field from each line, and saves it as a Markdown file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         input_file (str): Path to the input JSONL file. | 
					
						
							|  |  |  |         output_dir (str): Directory to save the Markdown files. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not os.path.exists(output_dir): | 
					
						
							|  |  |  |         os.makedirs(output_dir) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  |     with open(input_file, "r", encoding="utf-8") as file: | 
					
						
							| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  |         for i, line in enumerate(file): | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 # Parse the JSON line | 
					
						
							|  |  |  |                 data = json.loads(line) | 
					
						
							|  |  |  |                 text_content = data.get("text", "") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # Save to a Markdown file | 
					
						
							|  |  |  |                 output_file = os.path.join(output_dir, f"line_{i + 1}.md") | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  |                 with open(output_file, "w", encoding="utf-8") as md_file: | 
					
						
							| 
									
										
										
										
											2025-04-04 16:44:21 +01:00
										 |  |  |                     md_file.write(text_content) | 
					
						
							| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 print(f"Extracted and saved line {i + 1} to {output_file}") | 
					
						
							|  |  |  |             except json.JSONDecodeError as e: | 
					
						
							|  |  |  |                 print(f"Error decoding JSON on line {i + 1}: {e}") | 
					
						
							|  |  |  |             except Exception as e: | 
					
						
							|  |  |  |                 print(f"Unexpected error on line {i + 1}: {e}") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  | # Example usage | 
					
						
							|  |  |  | # input_jsonl_file = "/path/to/test.jsonl"  # Replace with the actual path to your JSONL file | 
					
						
							|  |  |  | # output_directory = "/path/to/output_markdown"  # Replace with the desired output directory | 
					
						
							|  |  |  | # jsonl_to_markdown(input_jsonl_file, output_directory) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # This is the main entrypoint to use the script from the command line. | 
					
						
							|  |  |  | # It takes two arguments: the input JSONL file and the output directory. | 
					
						
							|  |  |  | # The script will create the output directory if it does not exist. | 
					
						
							|  |  |  | if __name__ == "__main__": | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  |     if len(sys.argv) != 3: | 
					
						
							|  |  |  |         print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>") | 
					
						
							|  |  |  |         sys.exit(1) | 
					
						
							| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  |     input_file = sys.argv[1] | 
					
						
							|  |  |  |     output_dir = sys.argv[2] | 
					
						
							| 
									
										
										
										
											2025-04-04 12:52:58 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-04-04 19:45:38 +00:00
										 |  |  |     jsonl_to_markdown(input_file, output_dir) |