chore: refactor languages parameter for text_type functions (#1399)

### Summary
To support language functionality beyond Tesseract OCR, we want to
represent the languages provided for either partitioning accuracy or
OCR as a standard list of language codes (strings). This change
continues that refactor into the functions that use language checks to
identify potential element types such as NarrativeText and Title.

### Details
Replaces `language` with `languages` (a list of strings) as a parameter
to `is_possible_narrative_text` and `is_possible_title`.


### Test
Call `is_possible_narrative_text` and `is_possible_title` with text in a
variety of languages and different inputs for `languages`. The resulting
element classifications should be no different from the current outputs.

ex: see `test_text_type_handles_multi_language_examples` in
`test_unstructured/partition/test_text_type.py`.
This commit is contained in:
shreyanid 2023-09-13 15:46:36 -04:00 committed by GitHub
parent 1b7c99d878
commit d87c83d7b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 26 deletions

View File

@ -1,4 +1,4 @@
## 0.10.15-dev8
## 0.10.15-dev9
### Enhancements
@ -9,6 +9,8 @@
* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in image partitioning functions
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in auto partition
* Replaces `language` with `languages` as an input parameter to the `unstructured.partition.text_type` functions
* Removes `UNSTRUCTURED_LANGUAGE` env var. To skip English-specific checks, set the `languages` parameter to non-English language(s).
### Features

View File

@ -67,25 +67,32 @@ def test_text_type_handles_non_english_examples(monkeypatch):
narrative_text = "Я говорю по-русски. Вы тоже?"
title = "Риски"
assert text_type.is_possible_narrative_text(narrative_text, language="en") is False
assert text_type.is_possible_narrative_text(narrative_text, language="") is True
assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
assert text_type.is_possible_narrative_text(title, language="en") is False
assert text_type.is_possible_narrative_text(title, language="") is False
assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
assert text_type.is_possible_narrative_text(title, languages=[]) is False
assert text_type.is_possible_title(title, language="en") is False
assert text_type.is_possible_title(title, language="") is True
assert text_type.is_possible_title(title, languages=["eng"]) is False
assert text_type.is_possible_title(title, languages=[]) is True
def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")
def test_text_type_handles_multi_language_examples(monkeypatch):
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE_CHECKS", "true")
narrative_text = "Я говорю по-русски. Вы тоже? 不,我不会说俄语。"
title = "Риски (Riesgos)"
narrative_text = "Я говорю по-русски. Вы тоже?"
title = "Риски"
assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
assert text_type.is_possible_narrative_text(narrative_text, languages=["chi", "rus"]) is True
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
assert text_type.is_possible_narrative_text(narrative_text) is True
assert text_type.is_possible_narrative_text(title) is False
assert text_type.is_possible_title(title) is True
assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
assert text_type.is_possible_narrative_text(title, languages=["spa", "rus"]) is False
assert text_type.is_possible_narrative_text(title, languages=[]) is False
assert text_type.is_possible_title(title, languages=["eng"]) is False
assert text_type.is_possible_title(title, languages=["spa", "rus"]) is True
assert text_type.is_possible_title(title, languages=[]) is True
@pytest.mark.parametrize(

View File

@ -1 +1 @@
__version__ = "0.10.15-dev8" # pragma: no cover
__version__ = "0.10.15-dev9" # pragma: no cover

View File

@ -30,7 +30,7 @@ def is_possible_narrative_text(
text: str,
cap_threshold: float = 0.5,
non_alpha_threshold: float = 0.5,
language: str = "en",
languages: List[str] = ["eng"],
language_checks: bool = False,
) -> bool:
"""Checks to see if the text passes all of the checks for a narrative text section.
@ -47,8 +47,8 @@ def is_possible_narrative_text(
non_alpha_threshold
The minimum proportion of alpha characters the text needs to be considered
narrative text
language
The two letter language code for the text. defaults to "en" for English
languages
The list of languages present in the document. Defaults to ["eng"] for English
language_checks
If True, conducts checks that are specific to the chosen language. Turn on for more
accurate partitioning and off for faster processing.
@ -65,8 +65,7 @@ def is_possible_narrative_text(
trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore
return False
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
if language == "en" and language_checks and not contains_english_word(text):
if "eng" in languages and language_checks and not contains_english_word(text):
return False
# NOTE(robinson): it gets read in from the environment as a string so we need to
@ -84,7 +83,7 @@ def is_possible_narrative_text(
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
return False
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
if "eng" in languages and (sentence_count(text, 3) < 2) and (not contains_verb(text)):
trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501
return False
@ -96,7 +95,7 @@ def is_possible_title(
sentence_min_length: int = 5,
title_max_word_length: int = 12,
non_alpha_threshold: float = 0.5,
language: str = "en",
languages: List[str] = ["eng"],
language_checks: bool = False,
) -> bool:
"""Checks to see if the text passes all of the checks for a valid title.
@ -111,8 +110,8 @@ def is_possible_title(
The maximum number of words a title can contain
non_alpha_threshold
The minimum number of alpha characters the text needs to be considered a title
language
The two letter language code for the text. defaults to "en" for English
languages
The list of languages present in the document. Defaults to ["eng"] for English
language_checks
If True, conducts checks that are specific to the chosen language. Turn on for more
accurate partitioning and off for faster processing.
@ -146,8 +145,7 @@ def is_possible_title(
if text.endswith(","):
return False
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
if language == "en" and not contains_english_word(text) and language_checks:
if "eng" in languages and not contains_english_word(text) and language_checks:
return False
if text.isnumeric():