| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | # pyright: reportPrivateUsage=false | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """Unit-test suite for the `unstructured.partition.common.lang` module.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from __future__ import annotations | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import pathlib | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  | import pytest | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | from test_unstructured.unit_utils import LogCaptureFixture | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  | from unstructured.documents.elements import ( | 
					
						
							|  |  |  |     NarrativeText, | 
					
						
							|  |  |  |     PageBreak, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | from unstructured.partition.common.lang import ( | 
					
						
							| 
									
										
										
										
											2024-01-19 13:59:08 -06:00
										 |  |  |     _clean_ocr_languages_arg, | 
					
						
							| 
									
										
										
										
											2024-01-16 11:51:03 -06:00
										 |  |  |     _convert_language_code_to_pytesseract_lang_code, | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     apply_lang_metadata, | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  |     check_language_args, | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     detect_languages, | 
					
						
							|  |  |  |     prepare_languages_for_tesseract, | 
					
						
							| 
									
										
										
										
											2024-07-16 15:19:25 -07:00
										 |  |  |     tesseract_to_paddle_language, | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | DIRECTORY = pathlib.Path(__file__).parent.resolve() | 
					
						
							|  |  |  | EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | def test_prepare_languages_for_tesseract_with_one_language(): | 
					
						
							|  |  |  |     languages = ["en"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "eng" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-06 20:30:12 -05:00
										 |  |  | def test_prepare_languages_for_tesseract_with_duplicated_languages(): | 
					
						
							|  |  |  |     languages = ["en", "eng"] | 
					
						
							|  |  |  |     assert prepare_languages_for_tesseract(languages) == "eng" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | def test_prepare_languages_for_tesseract_special_case(): | 
					
						
							|  |  |  |     languages = ["osd"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "osd" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     languages = ["equ"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "equ" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_prepare_languages_for_tesseract_removes_empty_inputs(): | 
					
						
							|  |  |  |     languages = ["kbd", "es"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "spa+spa_old" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_prepare_languages_for_tesseract_includes_variants(): | 
					
						
							|  |  |  |     languages = ["chi"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_prepare_languages_for_tesseract_with_multiple_languages(): | 
					
						
							|  |  |  |     languages = ["ja", "afr", "en", "equ"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog: LogCaptureFixture): | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  |     languages = ["zzz", "chi"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  |     assert "not a valid standard language code" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog: LogCaptureFixture): | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  |     languages = ["kbd", "eng"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert prepare_languages_for_tesseract(languages) == "eng" | 
					
						
							| 
									
										
										
										
											2023-09-18 11:42:02 -04:00
										 |  |  |     assert "not a language supported by Tesseract" in caplog.text | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-06 20:30:12 -05:00
										 |  |  | def test_prepare_languages_for_tesseract_None_languages(): | 
					
						
							|  |  |  |     with pytest.raises(ValueError, match="`languages` can not be `None`"): | 
					
						
							|  |  |  |         languages = None | 
					
						
							|  |  |  |         prepare_languages_for_tesseract(languages) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_prepare_languages_for_tesseract_no_valid_languages(caplog: LogCaptureFixture): | 
					
						
							| 
									
										
										
										
											2023-11-06 20:30:12 -05:00
										 |  |  |     languages = [""] | 
					
						
							|  |  |  |     assert prepare_languages_for_tesseract(languages) == "eng" | 
					
						
							|  |  |  |     assert "Failed to find any valid standard language code from languages" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-16 15:19:25 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("tesseract_lang", "expected_lang"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("eng", "en"), | 
					
						
							|  |  |  |         ("chi_sim", "ch"), | 
					
						
							|  |  |  |         ("chi_tra", "chinese_cht"), | 
					
						
							|  |  |  |         ("deu", "german"), | 
					
						
							|  |  |  |         ("jpn", "japan"), | 
					
						
							|  |  |  |         ("kor", "korean"), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_tesseract_to_paddle_language_valid_codes(tesseract_lang: str, expected_lang: str): | 
					
						
							| 
									
										
										
										
											2024-07-16 15:19:25 -07:00
										 |  |  |     assert expected_lang == tesseract_to_paddle_language(tesseract_lang) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_tesseract_to_paddle_language_invalid_codes(caplog: LogCaptureFixture): | 
					
						
							| 
									
										
										
										
											2024-07-16 15:19:25 -07:00
										 |  |  |     tesseract_lang = "unsupported_lang" | 
					
						
							|  |  |  |     assert tesseract_to_paddle_language(tesseract_lang) == "en" | 
					
						
							|  |  |  |     assert "unsupported_lang is not a language code supported by PaddleOCR," in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("tesseract_lang", "expected_lang"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("ENG", "en"), | 
					
						
							|  |  |  |         ("Fra", "fr"), | 
					
						
							|  |  |  |         ("DEU", "german"), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang: str, expected_lang: str): | 
					
						
							| 
									
										
										
										
											2024-07-16 15:19:25 -07:00
										 |  |  |     assert expected_lang == tesseract_to_paddle_language(tesseract_lang) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  | def test_detect_languages_english_auto(): | 
					
						
							|  |  |  |     text = "This is a short sentence." | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert detect_languages(text) == ["eng"] | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_detect_languages_english_provided(): | 
					
						
							|  |  |  |     text = "This is another short sentence." | 
					
						
							|  |  |  |     languages = ["en"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert detect_languages(text, languages) == ["eng"] | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_detect_languages_korean_auto(): | 
					
						
							|  |  |  |     text = "안녕하세요" | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert detect_languages(text) == ["kor"] | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_detect_languages_gets_multiple_languages(): | 
					
						
							|  |  |  |     text = "My lubimy mleko i chleb." | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert detect_languages(text) == ["ces", "pol", "slk"] | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_detect_languages_warns_for_auto_and_other_input(caplog: LogCaptureFixture): | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  |     text = "This is another short sentence." | 
					
						
							|  |  |  |     languages = ["en", "auto", "rus"] | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     assert detect_languages(text, languages) == ["eng"] | 
					
						
							| 
									
										
										
										
											2023-09-26 14:09:27 -04:00
										 |  |  |     assert "rest of the inputted languages will be ignored" in caplog.text | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_detect_languages_raises_TypeError_for_invalid_languages(): | 
					
						
							|  |  |  |     with pytest.raises(TypeError): | 
					
						
							|  |  |  |         text = "This is a short sentence." | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |         detect_languages(text, languages="eng") == ["eng"]  # type: ignore | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog: LogCaptureFixture): | 
					
						
							| 
									
										
										
										
											2023-10-19 23:15:28 -05:00
										 |  |  |     elements = [NarrativeText("Sample text."), PageBreak("")] | 
					
						
							|  |  |  |     elements = list( | 
					
						
							|  |  |  |         apply_lang_metadata( | 
					
						
							|  |  |  |             elements=elements, | 
					
						
							|  |  |  |             languages=["auto"], | 
					
						
							|  |  |  |             detect_language_per_element=True, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert "No features in text." not in [rec.message for rec in caplog.records] | 
					
						
							| 
									
										
										
										
											2024-01-10 18:34:13 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-16 11:51:03 -06:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("lang_in", "expected_lang"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("en", "eng"), | 
					
						
							|  |  |  |         ("fr", "fra"), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_convert_language_code_to_pytesseract_lang_code(lang_in: str, expected_lang: str): | 
					
						
							| 
									
										
										
										
											2024-01-16 11:51:03 -06:00
										 |  |  |     assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-19 13:59:08 -06:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("input_ocr_langs", "expected"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         (["eng"], "eng"),  # list | 
					
						
							|  |  |  |         ('"deu"', "deu"),  # extra quotation marks | 
					
						
							|  |  |  |         ("[deu]", "deu"),  # brackets | 
					
						
							|  |  |  |         ("['deu']", "deu"),  # brackets and quotation marks | 
					
						
							|  |  |  |         (["[deu]"], "deu"),  # list, brackets and quotation marks | 
					
						
							|  |  |  |         (['"deu"'], "deu"),  # list and quotation marks | 
					
						
							|  |  |  |         ("deu+spa", "deu+spa"),  # correct input | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_clean_ocr_languages_arg(input_ocr_langs: str, expected: str): | 
					
						
							| 
									
										
										
										
											2024-01-19 13:59:08 -06:00
										 |  |  |     assert _clean_ocr_languages_arg(input_ocr_langs) == expected | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-18 18:14:45 -06:00
										 |  |  | def test_detect_languages_handles_spelled_out_languages(): | 
					
						
							|  |  |  |     languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"]) | 
					
						
							|  |  |  |     assert languages == ["spa"] | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("languages", "ocr_languages", "expected_langs"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         (["spa"], "deu", ["spa"]), | 
					
						
							|  |  |  |         (["spanish"], "english", ["spa"]), | 
					
						
							|  |  |  |         (["spa"], "[deu]", ["spa"]), | 
					
						
							|  |  |  |         (["spa"], '"deu"', ["spa"]), | 
					
						
							|  |  |  |         (["spa"], ["deu"], ["spa"]), | 
					
						
							|  |  |  |         (["spa"], ["[deu]"], ["spa"]), | 
					
						
							|  |  |  |         (["spa+deu"], "eng+deu", ["spa", "deu"]), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined( | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     languages: list[str], | 
					
						
							|  |  |  |     ocr_languages: list[str] | str, | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  |     expected_langs: list[str], | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     caplog: LogCaptureFixture, | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     returned_langs = check_language_args( | 
					
						
							|  |  |  |         languages=languages, | 
					
						
							|  |  |  |         ocr_languages=ocr_languages, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  |     for lang in returned_langs:  # type: ignore | 
					
						
							|  |  |  |         assert lang in expected_langs | 
					
						
							|  |  |  |         assert "ocr_languages" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("languages", "ocr_languages", "expected_langs"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         # raise warning and use `ocr_languages` when `languages` is empty or None | 
					
						
							|  |  |  |         ([], "deu", ["deu"]), | 
					
						
							|  |  |  |         ([""], '"deu"', ["deu"]), | 
					
						
							|  |  |  |         ([""], "deu", ["deu"]), | 
					
						
							|  |  |  |         ([""], "[deu]", ["deu"]), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     languages: list[str], | 
					
						
							|  |  |  |     ocr_languages: str, | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  |     expected_langs: list[str], | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     caplog: LogCaptureFixture, | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | ): | 
					
						
							|  |  |  |     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) | 
					
						
							|  |  |  |     for lang in returned_langs:  # type: ignore | 
					
						
							|  |  |  |         assert lang in expected_langs | 
					
						
							|  |  |  |         assert "ocr_languages" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("languages", "ocr_languages"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ([], None),  # how check_language_args is called from auto.partition() | 
					
						
							|  |  |  |         ([""], None), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_check_language_args_returns_None( | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     languages: list[str], | 
					
						
							|  |  |  |     ocr_languages: None, | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | ): | 
					
						
							|  |  |  |     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) | 
					
						
							|  |  |  |     assert returned_langs is None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  | def test_check_language_args_returns_auto(): | 
					
						
							|  |  |  |     assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"] | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("languages", "ocr_languages"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ([], ["auto"]), | 
					
						
							|  |  |  |         ([""], "eng+auto"), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_check_language_args_raises_error_when_ocr_languages_contains_auto( | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |     languages: list[str], | 
					
						
							|  |  |  |     ocr_languages: str | list[str], | 
					
						
							| 
									
										
										
										
											2024-01-29 14:12:08 -06:00
										 |  |  | ): | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2024-09-25 16:15:50 -07:00
										 |  |  |         check_language_args( | 
					
						
							|  |  |  |             languages=languages, | 
					
						
							|  |  |  |             ocr_languages=ocr_languages, | 
					
						
							|  |  |  |         ) |