diff --git a/CHANGELOG.md b/CHANGELOG.md index c97ad4d10..cd4fd2d79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* **Add a function to map between Tesseract and standard language codes.** This allows users to input language information to the `languages` param in any Tesseract-supported langcode or any ISO 639 standard language code. + ### Features ### Fixes @@ -62,7 +64,7 @@ * Update all connectors to use new downstream architecture * New click type added to parse comma-delimited string inputs * Some CLI options renamed - + ### Features ### Fixes diff --git a/example-docs/chi_sim_image.jpeg b/example-docs/chi_sim_image.jpeg new file mode 100644 index 000000000..1c6c37ab9 Binary files /dev/null and b/example-docs/chi_sim_image.jpeg differ diff --git a/example-docs/jpn-vert.jpeg b/example-docs/jpn-vert.jpeg new file mode 100644 index 000000000..4eb31e343 Binary files /dev/null and b/example-docs/jpn-vert.jpeg differ diff --git a/requirements/base.in b/requirements/base.in index e1aa14f79..11a99158f 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -9,3 +9,4 @@ requests beautifulsoup4 emoji dataclasses-json +python-iso639 \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt index 300e1df49..69faa127f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -36,6 +36,8 @@ nltk==3.8.1 # via -r requirements/base.in packaging==23.1 # via marshmallow +python-iso639==2023.6.15 + # via -r requirements/base.in python-magic==0.4.27 # via -r requirements/base.in regex==2023.8.8 diff --git a/requirements/dev.txt b/requirements/dev.txt index 0845c4db0..ecd2113f2 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -207,7 +207,7 @@ nbformat==5.9.2 # jupyter-server # nbclient # nbconvert -nest-asyncio==1.5.7 +nest-asyncio==1.5.8 # via ipykernel nodeenv==1.8.0 # via pre-commit diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index afd4927a4..f4af3eed1 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -27,7 +27,7 @@ click==8.1.7 # via # -c requirements/base.txt # flask -contourpy==1.1.0 +contourpy==1.1.1 # via matplotlib cssselect==1.2.0 # via premailer @@ -148,7 +148,7 @@ psutil==5.9.5 # via visualdl pyclipper==1.3.0.post5 # via unstructured-paddleocr -pycryptodome==3.18.0 +pycryptodome==3.19.0 # via bce-python-sdk pyparsing==3.0.9 # via diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index eba882b77..16a2a2a10 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -20,7 +20,7 @@ charset-normalizer==3.2.0 # requests coloredlogs==15.0.1 # via onnxruntime -contourpy==1.1.0 +contourpy==1.1.1 # via matplotlib cryptography==41.0.3 # via pdfminer-six @@ -124,7 +124,7 @@ pillow==10.0.1 # pytesseract # torchvision # unstructured-pytesseract -portalocker==2.7.0 +portalocker==2.8.2 # via iopath protobuf==4.23.4 # via diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 9fc26fff1..871e9df8a 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -4,7 +4,7 @@ # # pip-compile requirements/ingest-azure.in # -adlfs==2023.8.0 +adlfs==2023.9.0 # via -r requirements/ingest-azure.in aiohttp==3.8.5 # via adlfs @@ -71,7 +71,7 @@ multidict==6.0.4 # via # aiohttp # yarl -portalocker==2.7.0 +portalocker==2.8.2 # via msal-extensions pycparser==2.21 # via cffi diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index dbef302a7..ce026767a 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -430,6 +430,20 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename( assert point[1] is not math.nan +def test_partition_image_formats_languages_for_tesseract(): + filename = "example-docs/jpn-vert.jpeg" + with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: + image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"]) + mock_process.assert_called_once_with( + filename, + is_image=True, + ocr_languages="jpn_vert", + ocr_mode="entire_page", + extract_tables=False, + model_name=None, + ) + + def test_partition_image_warns_with_ocr_languages(caplog): filename = "example-docs/layout-parser-paper-fast.jpg" image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng") diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 6f30925cd..a3209a9cf 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -840,6 +840,20 @@ def test_add_chunking_strategy_on_partition_pdf( assert chunk_elements == chunks +def test_partition_pdf_formats_languages_for_tesseract(): + filename = "example-docs/DA-1p.pdf" + with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: + pdf.partition_pdf(filename=filename, strategy="hi_res", languages=["en"]) + mock_process.assert_called_once_with( + filename, + is_image=False, + ocr_languages="eng", + ocr_mode="entire_page", + extract_tables=False, + model_name=None, + ) + + def test_partition_pdf_warns_with_ocr_languages(caplog): filename = "example-docs/chevron-page.pdf" pdf.partition_pdf(filename=filename, strategy="hi_res", ocr_languages="eng") diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 4b1d7cb2c..ab23d93d7 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -369,6 +369,22 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ assert elements[1].text.startswith("Zejiang Shen") +def test_auto_partition_formats_languages_for_tesseract(): + filename = "example-docs/chi_sim_image.jpeg" + with patch( + "unstructured_inference.inference.layout.process_file_with_model", + ) as mock_process_file_with_model: + partition(filename, strategy="hi_res", languages=["zh"]) + mock_process_file_with_model.assert_called_once_with( + filename, + is_image=True, + ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert", + ocr_mode="entire_page", + extract_tables=False, + model_name=None, + ) + + def test_auto_partition_warns_with_ocr_languages(caplog): filename = "example-docs/chevron-page.pdf" partition(filename=filename, strategy="hi_res", ocr_languages="eng") diff --git a/test_unstructured/partition/test_lang.py b/test_unstructured/partition/test_lang.py new file mode 100644 index 000000000..4aa9219f8 --- /dev/null +++ b/test_unstructured/partition/test_lang.py @@ -0,0 +1,47 @@ +from unstructured.partition import lang + + +def test_prepare_languages_for_tesseract_with_one_language(): + languages = ["en"] + assert lang.prepare_languages_for_tesseract(languages) == "eng" + + +def test_prepare_languages_for_tesseract_special_case(): + languages = ["osd"] + assert lang.prepare_languages_for_tesseract(languages) == "osd" + + languages = ["equ"] + assert lang.prepare_languages_for_tesseract(languages) == "equ" + + +def test_prepare_languages_for_tesseract_removes_empty_inputs(): + languages = ["kbd", "es"] + assert lang.prepare_languages_for_tesseract(languages) == "spa+spa_old" + + +def test_prepare_languages_for_tesseract_includes_variants(): + languages = ["chi"] + assert ( + lang.prepare_languages_for_tesseract(languages) + == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" + ) + + +def test_prepare_languages_for_tesseract_with_multiple_languages(): + languages = ["ja", "afr", "en", "equ"] + assert lang.prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ" + + +def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog): + languages = ["zzz", "chi"] + assert ( + lang.prepare_languages_for_tesseract(languages) + == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" + ) + assert "not a valid standard language code" in caplog.text + + +def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog): + languages = ["kbd", "eng"] + assert lang.prepare_languages_for_tesseract(languages) == "eng" + assert "not a language supported by Tesseract" in caplog.text diff --git a/unstructured/partition/lang.py b/unstructured/partition/lang.py index e89ccec26..91915efad 100644 --- a/unstructured/partition/lang.py +++ b/unstructured/partition/lang.py @@ -1,13 +1,149 @@ from typing import List +import iso639 + +from unstructured.logger import logger + +# pytesseract.get_languages(config="") only shows user installed language packs, +# so manually include the list of all currently supported Tesseract languages +PYTESSERACT_LANGS = [ + "afr", + "amh", + "ara", + "asm", + "aze", + "aze_cyrl", + "bel", + "ben", + "bod", + "bos", + "bre", + "bul", + "cat", + "ceb", + "ces", + "chi_sim", + "chi_sim_vert", + "chi_tra", + "chi_tra_vert", + "chr", + "cos", + "cym", + "dan", + "deu", + "div", + "dzo", + "ell", + "eng", + "enm", + "epo", + "equ", + "est", + "eus", + "fao", + "fas", + "fil", + "fin", + "fra", + "frk", + "frm", + "fry", + "gla", + "gle", + "glg", + "grc", + "guj", + "hat", + "heb", + "hin", + "hrv", + "hun", + "hye", + "iku", + "ind", + "isl", + "ita", + "ita_old", + "jav", + "jpn", + "jpn_vert", + "kan", + "kat", + "kat_old", + "kaz", + "khm", + "kir", + "kmr", + "kor", + "kor_vert", + "lao", + "lat", + "lav", + "lit", + "ltz", + "mal", + "mar", + "mkd", + "mlt", + "mon", + "mri", + "msa", + "mya", + "nep", + "nld", + "nor", + "oci", + "ori", + "osd", + "pan", + "pol", + "por", + "pus", + "que", + "ron", + "rus", + "san", + "sin", + "slk", + "slv", + "snd", + "snum", + "spa", + "spa_old", + "sqi", + "srp", + "srp_latn", + "sun", + "swa", + "swe", + "syr", + "tam", + "tat", + "tel", + "tgk", + "tha", + "tir", + "ton", + "tur", + "uig", + "ukr", + "urd", + "uzb", + "uzb_cyrl", + "vie", + "yid", + "yor", +] + def prepare_languages_for_tesseract(languages: List[str] = ["eng"]): """ - Convert the languages param (list of strings) into tesseract ocr langcode format (uses +) string + Entry point: convert languages (list of strings) into tesseract ocr langcode format (uses +) """ - # NOTE(Shreya): assumes language codes are already in tesseract format (will be updated later) - - return "+".join(languages) + converted_languages = list( + filter(None, [convert_language_to_tesseract(lang) for lang in languages]), + ) + return "+".join(converted_languages) def convert_old_ocr_languages_to_languages(ocr_languages: str): @@ -17,3 +153,51 @@ def convert_old_ocr_languages_to_languages(ocr_languages: str): """ return ocr_languages.split("+") + + +def convert_language_to_tesseract(lang: str) -> str: + """ + Convert a language code to its tesseract formatted and recognized langcode(s), if supported. + """ + # if language is already tesseract langcode, return it immediately + # this will catch the tesseract special cases equ and osd + # NOTE(shreya): this may catch some cases of choosing between tesseract code variants for a lang + if lang in PYTESSERACT_LANGS: + return lang + + # get iso639 language object + try: + lang_iso639 = iso639.Language.match(lang.lower()) + except iso639.LanguageNotFoundError: + logger.warning(f"{lang} is not a valid standard language code.") + return "" + + # tesseract uses 3 digit codes (639-3, 639-2b, etc) as prefixes, with suffixes for orthography + # use first 3 letters of tesseract codes for matching to standard codes + pytesseract_langs_3 = {lang[:3] for lang in PYTESSERACT_LANGS} + + # try to match ISO 639-3 code + if lang_iso639.part3 in pytesseract_langs_3: + matched_langcodes = _get_all_tesseract_langcodes_with_prefix(lang_iso639.part3) + return "+".join(matched_langcodes) + + # try to match ISO 639-2b + elif lang_iso639.part2b in pytesseract_langs_3: + matched_langcodes = _get_all_tesseract_langcodes_with_prefix(lang_iso639.part2b) + return "+".join(matched_langcodes) + + # try to match ISO 639-2t + elif lang_iso639.part2t in pytesseract_langs_3: + matched_langcodes = _get_all_tesseract_langcodes_with_prefix(lang_iso639.part2t) + return "+".join(matched_langcodes) + + else: + logger.warning(f"{lang} is not a language supported by Tesseract.") + return "" + + +def _get_all_tesseract_langcodes_with_prefix(prefix: str): + """ + Get all matching tesseract langcodes with this prefix (may be one or multiple variants). + """ + return [langcode for langcode in PYTESSERACT_LANGS if langcode.startswith(prefix)]