mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00

Summary: Close: https://github.com/Unstructured-IO/unstructured/issues/1920 * stop passing an empty string from `languages` to tesseract, which would result in passing an empty string to the language config `-l` for the tesseract CLI * also stop passing duplicate language codes from `languages` to tesseract OCR * if we fail to convert any ISO language codes from the `languages` parameter, proceed with OCR using `eng` as the default ### Test * First confirm the tesseract error `Estimating resolution as X` before this change: * on the `unstructured-api` repo with main branch, run `make run-web-app` * curl to test the error from an empty string, or just any wrong input like `-F 'languages="eng,de"'`: ``` curl -X 'POST' 'http://0.0.0.0:8000/general/v0/general' \ -H 'accept: application/json' \ -H 'Content-Type: multipart/form-data' \ -F 'files=@sample-docs/layout-parser-paper-with-table.jpg' \ -F 'languages=""' \ -F 'strategy=hi_res' \ -F 'pdf_infer_table_structure=True' \ | jq -C . | less -R ``` * after this change: * in your unstructured API env, cd to the unstructured repo and install it locally with `pip install -e .` * check out this branch * run `make run-web-app` again in the api repo * the curl command now returns output, and a warning appears in the log --------- Co-authored-by: qued <64741807+qued@users.noreply.github.com>
115 lines
3.7 KiB
Python
115 lines
3.7 KiB
Python
import pytest
|
|
|
|
from unstructured.documents.elements import (
|
|
NarrativeText,
|
|
PageBreak,
|
|
)
|
|
from unstructured.partition.lang import (
|
|
apply_lang_metadata,
|
|
detect_languages,
|
|
prepare_languages_for_tesseract,
|
|
)
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_with_one_language():
    """The single input code ``"en"`` must come back as the string ``"eng"``."""
    result = prepare_languages_for_tesseract(["en"])
    assert result == "eng"
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_with_duplicated_languages():
    """``"en"`` and ``"eng"`` together must yield ``"eng"`` only once."""
    result = prepare_languages_for_tesseract(["en", "eng"])
    assert result == "eng"
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_special_case():
    """The codes ``"osd"`` and ``"equ"`` must each be returned unchanged."""
    # Same order as the original pair of assertions: "osd" first, then "equ".
    for code in ("osd", "equ"):
        assert prepare_languages_for_tesseract([code]) == code
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_removes_empty_inputs():
    """``"kbd"`` must contribute nothing; only the ``"es"`` codes remain."""
    expected = "spa+spa_old"
    result = prepare_languages_for_tesseract(["kbd", "es"])
    assert result == expected
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_includes_variants():
    """``"chi"`` must expand to all four ``chi_*`` codes joined by ``+``."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    result = prepare_languages_for_tesseract(["chi"])
    assert result == expected
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_with_multiple_languages():
    """Several input codes must be converted and joined in input order."""
    requested = ["ja", "afr", "en", "equ"]
    expected = "jpn+jpn_vert+afr+eng+equ"
    assert prepare_languages_for_tesseract(requested) == expected
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
    """``"zzz"`` must be dropped from the result and reported in the log."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    result = prepare_languages_for_tesseract(["zzz", "chi"])
    assert result == expected
    # The captured log output must mention the rejected code.
    assert "not a valid standard language code" in caplog.text
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
    """``"kbd"`` must be dropped from the result and reported in the log."""
    result = prepare_languages_for_tesseract(["kbd", "eng"])
    assert result == "eng"
    # The captured log output must mention the unsupported language.
    assert "not a language supported by Tesseract" in caplog.text
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_None_languages():
    """Passing ``None`` must raise ``ValueError`` with the expected message."""
    expected_message = "`languages` can not be `None`"
    with pytest.raises(ValueError, match=expected_message):
        prepare_languages_for_tesseract(None)
|
|
|
|
|
|
def test_prepare_languages_for_tesseract_no_valid_languages(caplog):
    """A list holding only ``""`` must yield ``"eng"`` and log the failure."""
    result = prepare_languages_for_tesseract([""])
    assert result == "eng"
    # The captured log output must explain that no valid code was found.
    expected_log = "Failed to find any valid standard language code from languages"
    assert expected_log in caplog.text
|
|
|
|
|
|
def test_detect_languages_english_auto():
    """With no ``languages`` argument, an English sentence gives ``["eng"]``."""
    detected = detect_languages("This is a short sentence.")
    assert detected == ["eng"]
|
|
|
|
|
|
def test_detect_languages_english_provided():
    """Passing ``["en"]`` as the languages argument must give ``["eng"]``."""
    detected = detect_languages("This is another short sentence.", ["en"])
    assert detected == ["eng"]
|
|
|
|
|
|
def test_detect_languages_korean_auto():
    """With no ``languages`` argument, the Korean sample gives ``["kor"]``."""
    detected = detect_languages("안녕하세요")
    assert detected == ["kor"]
|
|
|
|
|
|
def test_detect_languages_gets_multiple_languages():
    """The sample sentence must produce three codes, in this exact order."""
    expected = ["ces", "pol", "slk"]
    detected = detect_languages("My lubimy mleko i chleb.")
    assert detected == expected
|
|
|
|
|
|
def test_detect_languages_warns_for_auto_and_other_input(caplog):
    """Mixing ``"auto"`` with other codes must still detect and log a notice."""
    requested = ["en", "auto", "rus"]
    detected = detect_languages("This is another short sentence.", requested)
    assert detected == ["eng"]
    # The captured log output must say the extra inputs are ignored.
    assert "rest of the inputted languages will be ignored" in caplog.text
|
|
|
|
|
|
def test_detect_languages_raises_TypeError_for_invalid_languages():
    """Passing a plain string as ``languages`` must raise ``TypeError``."""
    text = "This is a short sentence."
    # Only the call expected to raise sits inside the ``raises`` block, and
    # its result is not compared to anything: once the exception fires there
    # is no return value to check.
    with pytest.raises(TypeError):
        detect_languages(text, languages="eng")
|
|
|
|
|
|
def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog):
    """A ``PageBreak`` with empty text must not log "No features in text."."""
    sample_elements = [NarrativeText("Sample text."), PageBreak("")]
    annotated = apply_lang_metadata(
        elements=sample_elements,
        languages=["auto"],
        detect_language_per_element=True,
    )
    # Materialize the result as the original did; presumably the returned
    # iterable is lazy, so this makes the per-element processing run.
    list(annotated)
    logged_messages = [record.message for record in caplog.records]
    assert "No features in text." not in logged_messages
|