mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 32bfebccf7
			
		
	
	
		32bfebccf7
		
			
		
	
	
	
	
		
			
			### Summary

			Uses `langdetect` to detect all languages present in the input document.

			### Details

			- Converts all language codes (whether supplied by the user or detected using `langdetect`) to a standard ISO 639-3 code.
			- Adds a `languages` field to the metadata.
			- Will revisit how to represent simplified vs. traditional Chinese scripts internally in a nonstandard way (separate PR).
			- Updates ingest test results to add the `languages` field to documents. Other side effects are changes in the order of some elements and changes in element categorization.

			### Test

			You can test the `detect_languages` function individually by importing it and passing in a text sample and, optionally, a language:

			```
			text = "My lubimy mleko i chleb."
			doc_langs = detect_languages(text)
			print(doc_langs)
			```

			-> ['ces', 'pol', 'slk']

			---------

			Co-authored-by: Newel H <37004249+newelh@users.noreply.github.com>
			Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
			Co-authored-by: shreyanid <shreyanid@users.noreply.github.com>
			Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com>
			Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
		
			
				
	
	
		
			76 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			76 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from unstructured.partition import lang
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_with_one_language():
    """A single "en" input is converted to the Tesseract code "eng"."""
    tesseract_arg = lang.prepare_languages_for_tesseract(["en"])
    assert tesseract_arg == "eng"
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_special_case():
    """The codes "osd" and "equ" are returned unchanged."""
    # Checked one at a time, in the same order as before, so a failure
    # still points at the specific code that broke.
    for special_code in ("osd", "equ"):
        assert lang.prepare_languages_for_tesseract([special_code]) == special_code
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_removes_empty_inputs():
    """An input that contributes nothing ("kbd") is left out of the result.

    Only the expansion of "es" ("spa+spa_old") is expected in the output.
    """
    tesseract_arg = lang.prepare_languages_for_tesseract(["kbd", "es"])
    assert tesseract_arg == "spa+spa_old"
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_includes_variants():
    """The "chi" input expands to all four chi_* Tesseract variants."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert lang.prepare_languages_for_tesseract(["chi"]) == expected
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_with_multiple_languages():
    """Several inputs are converted in input order and joined with "+"."""
    requested = ["ja", "afr", "en", "equ"]
    tesseract_arg = lang.prepare_languages_for_tesseract(requested)
    assert tesseract_arg == "jpn+jpn_vert+afr+eng+equ"
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
    """An unrecognized code ("zzz") is skipped and a message is logged.

    The remaining "chi" input must still expand to all of its variants.
    """
    tesseract_arg = lang.prepare_languages_for_tesseract(["zzz", "chi"])
    assert tesseract_arg == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert "not a valid standard language code" in caplog.text
 | |
| 
 | |
| 
 | |
def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
    """A code Tesseract does not support ("kbd") is dropped with a logged message."""
    tesseract_arg = lang.prepare_languages_for_tesseract(["kbd", "eng"])
    assert tesseract_arg == "eng"
    assert "not a language supported by Tesseract" in caplog.text
 | |
| 
 | |
| 
 | |
def test_detect_languages_english_auto():
    """With no languages passed, an English sentence is reported as ["eng"]."""
    detected = lang.detect_languages("This is a short sentence.")
    assert detected == ["eng"]
 | |
| 
 | |
| 
 | |
def test_detect_languages_english_provided():
    """Passing ["en"] alongside the text yields ["eng"]."""
    sample = "This is another short sentence."
    detected = lang.detect_languages(sample, ["en"])
    assert detected == ["eng"]
 | |
| 
 | |
| 
 | |
def test_detect_languages_korean_auto():
    """With no languages passed, a Korean greeting is reported as ["kor"]."""
    detected = lang.detect_languages("안녕하세요")
    assert detected == ["kor"]
 | |
| 
 | |
| 
 | |
def test_detect_languages_gets_multiple_languages():
    """A sample that matches several languages returns all of their codes."""
    # NOTE(review): the expected list is in alphabetical order; confirm
    # whether detect_languages guarantees that ordering.
    sample = "My lubimy mleko i chleb."
    detected = lang.detect_languages(sample)
    assert detected == ["ces", "pol", "slk"]
 | |
| 
 | |
| 
 | |
def test_detect_languages_warns_for_auto_and_other_input(caplog):
    """Mixing "auto" with explicit codes logs that the other codes are ignored.

    The English sample is still reported as ["eng"].
    """
    sample = "This is another short sentence."
    detected = lang.detect_languages(sample, ["en", "auto", "rus"])
    assert detected == ["eng"]
    assert "rest of the inputted languages will be ignored" in caplog.text
 |