chore: refactor languages parameter for text_type functions (#1399)

### Summary

In order to support language functionality other than Tesseract OCR, we want to represent languages provided for either partitioning accuracy or OCR as a standard list of langcodes as strings. This change continues that refactor into the functions that use language checks to identify potential element types such as NarrativeText and Title.

### Details

Replaces `language` with `languages` (a list of strings) as a parameter to `is_possible_narrative_text` and `is_possible_title`.

### Test

Call `is_possible_narrative_text` and `is_possible_title` with text in a variety of languages and different inputs for `languages`. The resulting element classifications should be no different from the current outputs. For example, see `test_text_type_handles_multi_language_examples` in `test_unstructured/partition/test_text_type.py`.
2025-09-03 05:39:47 +00:00 · 2023-09-13 15:46:36 -04:00 · 2023-09-13 15:46:36 -04:00 · d87c83d7b6
commit d87c83d7b6
parent 1b7c99d878
4 changed files with 33 additions and 26 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.10.15-dev8
+## 0.10.15-dev9
 ### Enhancements
@ -9,6 +9,8 @@
 * Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
 * Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in image partitioning functions
 * Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in auto partition
+* Replaces `language` with `languages` as an input parameter to unstructured-partition-text_type functions
+* Removes `UNSTRUCTURED_LANGUAGE` env var. To skip English specific checks, set the `languages` parameter to non-English language(s).
 ### Features
--- a/test_unstructured/partition/test_text_type.py
+++ b/test_unstructured/partition/test_text_type.py
@ -67,25 +67,32 @@ def test_text_type_handles_non_english_examples(monkeypatch):
    narrative_text = "Я говорю по-русски. Вы тоже?"
    title = "Риски"
-    assert text_type.is_possible_narrative_text(narrative_text, language="en") is False
+    assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
-    assert text_type.is_possible_narrative_text(narrative_text, language="") is True
+    assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
-    assert text_type.is_possible_narrative_text(title, language="en") is False
+    assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
-    assert text_type.is_possible_narrative_text(title, language="") is False
+    assert text_type.is_possible_narrative_text(title, languages=[]) is False
-    assert text_type.is_possible_title(title, language="en") is False
+    assert text_type.is_possible_title(title, languages=["eng"]) is False
-    assert text_type.is_possible_title(title, language="") is True
+    assert text_type.is_possible_title(title, languages=[]) is True
-def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
-    monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")
-    narrative_text = "Я говорю по-русски. Вы тоже?"
-    title = "Риски"
-    assert text_type.is_possible_narrative_text(narrative_text) is True
-    assert text_type.is_possible_narrative_text(title) is False
-    assert text_type.is_possible_title(title) is True
+def test_text_type_handles_multi_language_examples(monkeypatch):
+    monkeypatch.setenv("UNSTRUCTURED_LANGUAGE_CHECKS", "true")
+    narrative_text = "Я говорю по-русски. Вы тоже? 不，我不会说俄语。"
+    title = "Риски (Riesgos)"
+    assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
+    assert text_type.is_possible_narrative_text(narrative_text, languages=["chi", "rus"]) is True
+    assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
+    assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
+    assert text_type.is_possible_narrative_text(title, languages=["spa", "rus"]) is False
+    assert text_type.is_possible_narrative_text(title, languages=[]) is False
+    assert text_type.is_possible_title(title, languages=["eng"]) is False
+    assert text_type.is_possible_title(title, languages=["spa", "rus"]) is True
+    assert text_type.is_possible_title(title, languages=[]) is True
@pytest.mark.parametrize(
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.15-dev8"  # pragma: no cover
+__version__ = "0.10.15-dev9"  # pragma: no cover
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@ -30,7 +30,7 @@ def is_possible_narrative_text(
    text: str,
    cap_threshold: float = 0.5,
    non_alpha_threshold: float = 0.5,
-    language: str = "en",
+    languages: List[str] = ["eng"],
    language_checks: bool = False,
 ) -> bool:
    """Checks to see if the text passes all of the checks for a narrative text section.
@ -47,8 +47,8 @@ def is_possible_narrative_text(
    non_alpha_threshold
        The minimum proportion of alpha characters the text needs to be considered
        narrative text
-    language
+    languages
-        The two letter language code for the text. defaults to "en" for English
+        The list of languages present in the document. Defaults to ["eng"] for English
    language_checks
        If True, conducts checks that are specific to the chosen language. Turn on for more
        accurate partitioning and off for faster processing.
@ -65,8 +65,7 @@ def is_possible_narrative_text(
        trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}")  # type: ignore
        return False
-    language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
-    if language == "en" and language_checks and not contains_english_word(text):
+    if "eng" in languages and language_checks and not contains_english_word(text):
        return False
    # NOTE(robinson): it gets read in from the environment as a string so we need to
@ -84,7 +83,7 @@ def is_possible_narrative_text(
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False
-    if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
+    if "eng" in languages and (sentence_count(text, 3) < 2) and (not contains_verb(text)):
        trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}")  # type: ignore # noqa: E501
        return False
@ -96,7 +95,7 @@ def is_possible_title(
    sentence_min_length: int = 5,
    title_max_word_length: int = 12,
    non_alpha_threshold: float = 0.5,
-    language: str = "en",
+    languages: List[str] = ["eng"],
    language_checks: bool = False,
 ) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.
@ -111,8 +110,8 @@ def is_possible_title(
        The maximum number of words a title can contain
    non_alpha_threshold
        The minimum number of alpha characters the text needs to be considered a title
-    language
+    languages
-        The two letter language code for the text. defaults to "en" for English
+        The list of languages present in the document. Defaults to ["eng"] for English
    language_checks
        If True, conducts checks that are specific to the chosen language. Turn on for more
        accurate partitioning and off for faster processing.
@ -146,8 +145,7 @@ def is_possible_title(
    if text.endswith(","):
        return False
-    language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
-    if language == "en" and not contains_english_word(text) and language_checks:
+    if "eng" in languages and not contains_english_word(text) and language_checks:
        return False
    if text.isnumeric():
`@ -1 +1 @@`
	`__version__ = "0.10.15-dev8" # pragma: no cover`	`__version__ = "0.10.15-dev9" # pragma: no cover`