diff --git a/CHANGELOG.md b/CHANGELOG.md index d0d6d98fc..72e9587fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.15-dev8 +## 0.10.15-dev9 ### Enhancements @@ -9,6 +9,8 @@ * Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition` * Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in image partitioning functions * Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in auto partition +* Replaces `language` with `languages` as an input parameter to `unstructured.partition.text_type` functions +* Removes `UNSTRUCTURED_LANGUAGE` env var. To skip English-specific checks, set the `languages` parameter to non-English language(s). ### Features diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py index d85b90fec..72561c0cb 100644 --- a/test_unstructured/partition/test_text_type.py +++ b/test_unstructured/partition/test_text_type.py @@ -67,25 +67,32 @@ def test_text_type_handles_non_english_examples(monkeypatch): narrative_text = "Я говорю по-русски. Вы тоже?" 
title = "Риски" - assert text_type.is_possible_narrative_text(narrative_text, language="en") is False - assert text_type.is_possible_narrative_text(narrative_text, language="") is True + assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False + assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True - assert text_type.is_possible_narrative_text(title, language="en") is False - assert text_type.is_possible_narrative_text(title, language="") is False + assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False + assert text_type.is_possible_narrative_text(title, languages=[]) is False - assert text_type.is_possible_title(title, language="en") is False - assert text_type.is_possible_title(title, language="") is True + assert text_type.is_possible_title(title, languages=["eng"]) is False + assert text_type.is_possible_title(title, languages=[]) is True -def test_text_type_handles_non_english_examples_with_env_var(monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "") +def test_text_type_handles_multi_language_examples(monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_LANGUAGE_CHECKS", "true") + narrative_text = "Я говорю по-русски. Вы тоже? 不,我不会说俄语。" + title = "Риски (Riesgos)" - narrative_text = "Я говорю по-русски. Вы тоже?" 
- title = "Риски" + assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False + assert text_type.is_possible_narrative_text(narrative_text, languages=["chi", "rus"]) is True + assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True - assert text_type.is_possible_narrative_text(narrative_text) is True - assert text_type.is_possible_narrative_text(title) is False - assert text_type.is_possible_title(title) is True + assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False + assert text_type.is_possible_narrative_text(title, languages=["spa", "rus"]) is False + assert text_type.is_possible_narrative_text(title, languages=[]) is False + + assert text_type.is_possible_title(title, languages=["eng"]) is False + assert text_type.is_possible_title(title, languages=["spa", "rus"]) is True + assert text_type.is_possible_title(title, languages=[]) is True @pytest.mark.parametrize( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce57c4d25..fb932abdf 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.15-dev8" # pragma: no cover +__version__ = "0.10.15-dev9" # pragma: no cover diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 819b0898f..53dbd3162 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -30,7 +30,7 @@ def is_possible_narrative_text( text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.5, - language: str = "en", + languages: List[str] = ["eng"], language_checks: bool = False, ) -> bool: """Checks to see if the text passes all of the checks for a narrative text section. @@ -47,8 +47,8 @@ def is_possible_narrative_text( non_alpha_threshold The minimum proportion of alpha characters the text needs to be considered narrative text - language - The two letter language code for the text. 
defaults to "en" for English + languages + The list of languages present in the document. Defaults to ["eng"] for English language_checks If True, conducts checks that are specific to the chosen language. Turn on for more accurate partitioning and off for faster processing. @@ -65,8 +65,7 @@ def is_possible_narrative_text( trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore return False - language = os.environ.get("UNSTRUCTURED_LANGUAGE", language) - if language == "en" and language_checks and not contains_english_word(text): + if "eng" in languages and language_checks and not contains_english_word(text): return False # NOTE(robinson): it gets read in from the environment as a string so we need to @@ -84,7 +83,7 @@ def is_possible_narrative_text( if under_non_alpha_ratio(text, threshold=non_alpha_threshold): return False - if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en": + if "eng" in languages and (sentence_count(text, 3) < 2) and (not contains_verb(text)): trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501 return False @@ -96,7 +95,7 @@ def is_possible_title( sentence_min_length: int = 5, title_max_word_length: int = 12, non_alpha_threshold: float = 0.5, - language: str = "en", + languages: List[str] = ["eng"], language_checks: bool = False, ) -> bool: """Checks to see if the text passes all of the checks for a valid title. @@ -111,8 +110,8 @@ def is_possible_title( The maximum number of words a title can contain non_alpha_threshold The minimum number of alpha characters the text needs to be considered a title - language - The two letter language code for the text. defaults to "en" for English + languages + The list of languages present in the document. Defaults to ["eng"] for English language_checks If True, conducts checks that are specific to the chosen language. Turn on for more accurate partitioning and off for faster processing. 
@@ -146,8 +145,7 @@ def is_possible_title( if text.endswith(","): return False - language = os.environ.get("UNSTRUCTURED_LANGUAGE", language) - if language == "en" and not contains_english_word(text) and language_checks: + if "eng" in languages and not contains_english_word(text) and language_checks: return False if text.isnumeric():