mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 12:23:49 +00:00
chore: refactor languages parameter for text_type functions (#1399)
### Summary

In order to support language functionality other than Tesseract OCR, we want to represent languages provided for either partitioning accuracy or OCR as a standard list of langcodes as strings. To identify element types such as NarrativeText and Title, this change continues the refactor into the functions that use language checks to determine those potential classifications.

### Details

Replaces `language` with `languages` (a list of strings) as a parameter to `is_possible_narrative_text` and `is_possible_title`.

### Test

Call `is_possible_narrative_text` and `is_possible_title` with text in a variety of languages and different inputs for `languages`. The resulting element classifications should be no different from the current outputs. For example, see `test_text_type_handles_multi_language_examples` in `test_unstructured/partition/test_text_type.py`.
This commit is contained in:
parent
1b7c99d878
commit
d87c83d7b6
@ -1,4 +1,4 @@
|
||||
## 0.10.15-dev8
|
||||
## 0.10.15-dev9
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,8 @@
|
||||
* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
|
||||
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in image partitioning functions
|
||||
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in auto partition
|
||||
* Replaces `language` with `languages` as an input parameter to the `unstructured.partition.text_type` functions
|
||||
* Removes `UNSTRUCTURED_LANGUAGE` env var. To skip English-specific checks, set the `languages` parameter to non-English language(s).
|
||||
|
||||
### Features
|
||||
|
||||
|
@ -67,25 +67,32 @@ def test_text_type_handles_non_english_examples(monkeypatch):
|
||||
narrative_text = "Я говорю по-русски. Вы тоже?"
|
||||
title = "Риски"
|
||||
|
||||
assert text_type.is_possible_narrative_text(narrative_text, language="en") is False
|
||||
assert text_type.is_possible_narrative_text(narrative_text, language="") is True
|
||||
assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
|
||||
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
|
||||
|
||||
assert text_type.is_possible_narrative_text(title, language="en") is False
|
||||
assert text_type.is_possible_narrative_text(title, language="") is False
|
||||
assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
|
||||
assert text_type.is_possible_narrative_text(title, languages=[]) is False
|
||||
|
||||
assert text_type.is_possible_title(title, language="en") is False
|
||||
assert text_type.is_possible_title(title, language="") is True
|
||||
assert text_type.is_possible_title(title, languages=["eng"]) is False
|
||||
assert text_type.is_possible_title(title, languages=[]) is True
|
||||
|
||||
|
||||
def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
|
||||
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")
|
||||
def test_text_type_handles_multi_language_examples(monkeypatch):
|
||||
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE_CHECKS", "true")
|
||||
narrative_text = "Я говорю по-русски. Вы тоже? 不,我不会说俄语。"
|
||||
title = "Риски (Riesgos)"
|
||||
|
||||
narrative_text = "Я говорю по-русски. Вы тоже?"
|
||||
title = "Риски"
|
||||
assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
|
||||
assert text_type.is_possible_narrative_text(narrative_text, languages=["chi", "rus"]) is True
|
||||
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
|
||||
|
||||
assert text_type.is_possible_narrative_text(narrative_text) is True
|
||||
assert text_type.is_possible_narrative_text(title) is False
|
||||
assert text_type.is_possible_title(title) is True
|
||||
assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
|
||||
assert text_type.is_possible_narrative_text(title, languages=["spa", "rus"]) is False
|
||||
assert text_type.is_possible_narrative_text(title, languages=[]) is False
|
||||
|
||||
assert text_type.is_possible_title(title, languages=["eng"]) is False
|
||||
assert text_type.is_possible_title(title, languages=["spa", "rus"]) is True
|
||||
assert text_type.is_possible_title(title, languages=[]) is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.15-dev8" # pragma: no cover
|
||||
__version__ = "0.10.15-dev9" # pragma: no cover
|
||||
|
@ -30,7 +30,7 @@ def is_possible_narrative_text(
|
||||
text: str,
|
||||
cap_threshold: float = 0.5,
|
||||
non_alpha_threshold: float = 0.5,
|
||||
language: str = "en",
|
||||
languages: List[str] = ["eng"],
|
||||
language_checks: bool = False,
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a narrative text section.
|
||||
@ -47,8 +47,8 @@ def is_possible_narrative_text(
|
||||
non_alpha_threshold
|
||||
The minimum proportion of alpha characters the text needs to be considered
|
||||
narrative text
|
||||
language
|
||||
The two letter language code for the text. defaults to "en" for English
|
||||
languages
|
||||
The list of languages present in the document. Defaults to ["eng"] for English
|
||||
language_checks
|
||||
If True, conducts checks that are specific to the chosen language. Turn on for more
|
||||
accurate partitioning and off for faster processing.
|
||||
@ -65,8 +65,7 @@ def is_possible_narrative_text(
|
||||
trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore
|
||||
return False
|
||||
|
||||
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
|
||||
if language == "en" and language_checks and not contains_english_word(text):
|
||||
if "eng" in languages and language_checks and not contains_english_word(text):
|
||||
return False
|
||||
|
||||
# NOTE(robinson): it gets read in from the environment as a string so we need to
|
||||
@ -84,7 +83,7 @@ def is_possible_narrative_text(
|
||||
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
|
||||
return False
|
||||
|
||||
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
|
||||
if "eng" in languages and (sentence_count(text, 3) < 2) and (not contains_verb(text)):
|
||||
trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501
|
||||
return False
|
||||
|
||||
@ -96,7 +95,7 @@ def is_possible_title(
|
||||
sentence_min_length: int = 5,
|
||||
title_max_word_length: int = 12,
|
||||
non_alpha_threshold: float = 0.5,
|
||||
language: str = "en",
|
||||
languages: List[str] = ["eng"],
|
||||
language_checks: bool = False,
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a valid title.
|
||||
@ -111,8 +110,8 @@ def is_possible_title(
|
||||
The maximum number of words a title can contain
|
||||
non_alpha_threshold
|
||||
The minimum number of alpha characters the text needs to be considered a title
|
||||
language
|
||||
The two letter language code for the text. defaults to "en" for English
|
||||
languages
|
||||
The list of languages present in the document. Defaults to ["eng"] for English
|
||||
language_checks
|
||||
If True, conducts checks that are specific to the chosen language. Turn on for more
|
||||
accurate partitioning and off for faster processing.
|
||||
@ -146,8 +145,7 @@ def is_possible_title(
|
||||
if text.endswith(","):
|
||||
return False
|
||||
|
||||
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
|
||||
if language == "en" and not contains_english_word(text) and language_checks:
|
||||
if "eng" in languages and not contains_english_word(text) and language_checks:
|
||||
return False
|
||||
|
||||
if text.isnumeric():
|
||||
|
Loading…
x
Reference in New Issue
Block a user