mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-16 05:25:55 +00:00

### Summary
In order to convert between incompatible language codes from packages used for OCR, this change adds a function to map between any standard language codes and Tesseract OCR-specific codes. Users can input language information to `languages` in any Tesseract-supported langcode or any ISO 639 standard language code.

### Details
- Introduces the [python-iso639](https://pypi.org/project/python-iso639/) package for matching standard language codes. Recompiles all dependencies.
- If a language is not already supplied by the user as a Tesseract-specific langcode, supplies all possible script/orthography variants of the language to the Tesseract OCR agent.

### Test
Added many unit tests for a variety of language combinations, special cases, and variants. For general testing, call partition functions with any lang codes in the `languages` parameter (Tesseract or standard). For example,
```
from unstructured.partition.auto import partition
elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```
should supply eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert to Tesseract.
48 lines
1.5 KiB
Python
48 lines
1.5 KiB
Python
from unstructured.partition import lang
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_with_one_language():
    """A single-entry list holding "en" is converted to the string "eng"."""
    tesseract_langs = lang.prepare_languages_for_tesseract(["en"])
    assert tesseract_langs == "eng"
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_special_case():
    """The codes "osd" and "equ" each come back unchanged."""
    # Checked one at a time, in the same order as before, each in its own list.
    for special_code in ("osd", "equ"):
        assert lang.prepare_languages_for_tesseract([special_code]) == special_code
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_removes_empty_inputs():
    """Only the Spanish codes remain when "kbd" is passed alongside "es"."""
    # "kbd" must leave no trace in the joined result (no empty "+" segment).
    expected = "spa+spa_old"
    actual = lang.prepare_languages_for_tesseract(["kbd", "es"])
    assert actual == expected
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_includes_variants():
    """The code "chi" expands to all four "chi_*" variants, joined with "+"."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    actual = lang.prepare_languages_for_tesseract(["chi"])
    assert actual == expected
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_with_multiple_languages():
    """Several codes are converted in input order and joined with "+"."""
    requested = ["ja", "afr", "en", "equ"]
    expected = "jpn+jpn_vert+afr+eng+equ"
    assert lang.prepare_languages_for_tesseract(requested) == expected
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
    """The unknown code "zzz" is dropped from the result and a message is logged.

    The "chi" entry must still expand to its four variants, and the captured
    log text must mention that a code is not a valid standard language code.
    """
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    actual = lang.prepare_languages_for_tesseract(["zzz", "chi"])
    assert actual == expected
    assert "not a valid standard language code" in caplog.text
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
    """The code "kbd" is dropped from the result and a message is logged.

    Only "eng" must remain in the output, and the captured log text must
    mention that a code is not a language supported by Tesseract.
    """
    actual = lang.prepare_languages_for_tesseract(["kbd", "eng"])
    assert actual == "eng"
    assert "not a language supported by Tesseract" in caplog.text
|