mirror of
				https://github.com/rasbt/LLMs-from-scratch.git
				synced 2025-10-31 09:50:23 +00:00 
			
		
		
		
	Update bpe_openai_gpt2.py
signficant -> significant
This commit is contained in:
		
							parent
							
								
									f6e80a376d
								
							
						
					
					
						commit
						3a3a4ac1f1
					
				| @ -48,7 +48,7 @@ def bytes_to_unicode(): | ||||
|     The reversible bpe codes work on unicode strings. | ||||
|     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. | ||||
|     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. | ||||
| -    This is a signficant percentage of your normal, say, 32K bpe vocab. | ||||
| +    This is a significant percentage of your normal, say, 32K bpe vocab. | ||||
|     To avoid that, we want lookup tables between utf-8 bytes and unicode strings. | ||||
|     And avoids mapping to whitespace/control characters the bpe code barfs on. | ||||
|     """ | ||||
| @ -171,4 +171,4 @@ def download_vocab(): | ||||
|                 # 1k for chunk_size, since Ethernet packet size is around 1500 bytes | ||||
|                 for chunk in r.iter_content(chunk_size=chunk_size): | ||||
|                     f.write(chunk) | ||||
| -                    pbar.update(chunk_size) | ||||
| +                    pbar.update(chunk_size) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Ikko Eltociear Ashimine
						Ikko Eltociear Ashimine