mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 18:15:44 +00:00 
			
		
		
		
	Merge branch 'main' into jakep/olmocr_v03
This commit is contained in:
		
						commit
						3ae173bd72
					
				| @ -155,11 +155,15 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination: | ||||
|                     f.write("---\n") | ||||
|                     for k, v in front_matter.items(): | ||||
|                         f.write(f"{k}: {v}\n") | ||||
|                     f.write("---\n") | ||||
| 
 | ||||
|                     # Write natural text | ||||
|                     f.write(natural_text) | ||||
|                     if natural_text is not None and len(natural_text.strip()) > 0: | ||||
|                         f.write("---\n") | ||||
| 
 | ||||
|                         # Write natural text | ||||
|                         f.write(natural_text) | ||||
|                     else: | ||||
|                         f.write("---") | ||||
|                          | ||||
|                 # Look for matching PDF in extracted directory and create symlinks | ||||
|                 extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted" | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jake Poznanski
						Jake Poznanski