mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-03 05:39:47 +00:00
chore: refactor languages parameter for text_type functions (#1399)
### Summary

In order to support language functionality other than Tesseract OCR, we want to represent languages provided for either partitioning accuracy or OCR as a standard list of langcodes as strings. This change continues that refactor into the functions that use language checks to identify potential element types such as NarrativeText and Title.

### Details

Replaces `language` with `languages` (a list of strings) as a parameter to `is_possible_narrative_text` and `is_possible_title`.

### Test

Call `is_possible_narrative_text` and `is_possible_title` with text in a variety of languages and different inputs for `languages`. The resulting element classifications should be no different from the current outputs. For example, see `test_text_type_handles_multi_language_examples` in `test_unstructured/partition/test_text_type.py`.
This commit is contained in:
parent
1b7c99d878
commit
d87c83d7b6
@ -1,4 +1,4 @@
|
|||||||
## 0.10.15-dev8
|
## 0.10.15-dev9
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -9,6 +9,8 @@
|
|||||||
* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
|
* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
|
||||||
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in image partitioning functions
|
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in image partitioning functions
|
||||||
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in auto partition
|
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in auto partition
|
||||||
|
* Replaces `language` with `languages` as an input parameter to unstructured-partition-text_type functions
|
||||||
|
* Removes `UNSTRUCTURED_LANGUAGE` env var. To skip English-specific checks, set the `languages` parameter to non-English language(s).
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
@ -67,25 +67,32 @@ def test_text_type_handles_non_english_examples(monkeypatch):
|
|||||||
narrative_text = "Я говорю по-русски. Вы тоже?"
|
narrative_text = "Я говорю по-русски. Вы тоже?"
|
||||||
title = "Риски"
|
title = "Риски"
|
||||||
|
|
||||||
assert text_type.is_possible_narrative_text(narrative_text, language="en") is False
|
assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
|
||||||
assert text_type.is_possible_narrative_text(narrative_text, language="") is True
|
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
|
||||||
|
|
||||||
assert text_type.is_possible_narrative_text(title, language="en") is False
|
assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
|
||||||
assert text_type.is_possible_narrative_text(title, language="") is False
|
assert text_type.is_possible_narrative_text(title, languages=[]) is False
|
||||||
|
|
||||||
assert text_type.is_possible_title(title, language="en") is False
|
assert text_type.is_possible_title(title, languages=["eng"]) is False
|
||||||
assert text_type.is_possible_title(title, language="") is True
|
assert text_type.is_possible_title(title, languages=[]) is True
|
||||||
|
|
||||||
|
|
||||||
def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
|
def test_text_type_handles_multi_language_examples(monkeypatch):
|
||||||
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")
|
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE_CHECKS", "true")
|
||||||
|
narrative_text = "Я говорю по-русски. Вы тоже? 不,我不会说俄语。"
|
||||||
|
title = "Риски (Riesgos)"
|
||||||
|
|
||||||
narrative_text = "Я говорю по-русски. Вы тоже?"
|
assert text_type.is_possible_narrative_text(narrative_text, languages=["eng"]) is False
|
||||||
title = "Риски"
|
assert text_type.is_possible_narrative_text(narrative_text, languages=["chi", "rus"]) is True
|
||||||
|
assert text_type.is_possible_narrative_text(narrative_text, languages=[]) is True
|
||||||
|
|
||||||
assert text_type.is_possible_narrative_text(narrative_text) is True
|
assert text_type.is_possible_narrative_text(title, languages=["eng"]) is False
|
||||||
assert text_type.is_possible_narrative_text(title) is False
|
assert text_type.is_possible_narrative_text(title, languages=["spa", "rus"]) is False
|
||||||
assert text_type.is_possible_title(title) is True
|
assert text_type.is_possible_narrative_text(title, languages=[]) is False
|
||||||
|
|
||||||
|
assert text_type.is_possible_title(title, languages=["eng"]) is False
|
||||||
|
assert text_type.is_possible_title(title, languages=["spa", "rus"]) is True
|
||||||
|
assert text_type.is_possible_title(title, languages=[]) is True
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.10.15-dev8" # pragma: no cover
|
__version__ = "0.10.15-dev9" # pragma: no cover
|
||||||
|
@ -30,7 +30,7 @@ def is_possible_narrative_text(
|
|||||||
text: str,
|
text: str,
|
||||||
cap_threshold: float = 0.5,
|
cap_threshold: float = 0.5,
|
||||||
non_alpha_threshold: float = 0.5,
|
non_alpha_threshold: float = 0.5,
|
||||||
language: str = "en",
|
languages: List[str] = ["eng"],
|
||||||
language_checks: bool = False,
|
language_checks: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Checks to see if the text passes all of the checks for a narrative text section.
|
"""Checks to see if the text passes all of the checks for a narrative text section.
|
||||||
@ -47,8 +47,8 @@ def is_possible_narrative_text(
|
|||||||
non_alpha_threshold
|
non_alpha_threshold
|
||||||
The minimum proportion of alpha characters the text needs to be considered
|
The minimum proportion of alpha characters the text needs to be considered
|
||||||
narrative text
|
narrative text
|
||||||
language
|
languages
|
||||||
The two letter language code for the text. defaults to "en" for English
|
The list of languages present in the document. Defaults to ["eng"] for English
|
||||||
language_checks
|
language_checks
|
||||||
If True, conducts checks that are specific to the chosen language. Turn on for more
|
If True, conducts checks that are specific to the chosen language. Turn on for more
|
||||||
accurate partitioning and off for faster processing.
|
accurate partitioning and off for faster processing.
|
||||||
@ -65,8 +65,7 @@ def is_possible_narrative_text(
|
|||||||
trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore
|
trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore
|
||||||
return False
|
return False
|
||||||
|
|
||||||
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
|
if "eng" in languages and language_checks and not contains_english_word(text):
|
||||||
if language == "en" and language_checks and not contains_english_word(text):
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# NOTE(robinson): it gets read in from the environment as a string so we need to
|
# NOTE(robinson): it gets read in from the environment as a string so we need to
|
||||||
@ -84,7 +83,7 @@ def is_possible_narrative_text(
|
|||||||
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
|
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
|
if "eng" in languages and (sentence_count(text, 3) < 2) and (not contains_verb(text)):
|
||||||
trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501
|
trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -96,7 +95,7 @@ def is_possible_title(
|
|||||||
sentence_min_length: int = 5,
|
sentence_min_length: int = 5,
|
||||||
title_max_word_length: int = 12,
|
title_max_word_length: int = 12,
|
||||||
non_alpha_threshold: float = 0.5,
|
non_alpha_threshold: float = 0.5,
|
||||||
language: str = "en",
|
languages: List[str] = ["eng"],
|
||||||
language_checks: bool = False,
|
language_checks: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Checks to see if the text passes all of the checks for a valid title.
|
"""Checks to see if the text passes all of the checks for a valid title.
|
||||||
@ -111,8 +110,8 @@ def is_possible_title(
|
|||||||
The maximum number of words a title can contain
|
The maximum number of words a title can contain
|
||||||
non_alpha_threshold
|
non_alpha_threshold
|
||||||
The minimum number of alpha characters the text needs to be considered a title
|
The minimum number of alpha characters the text needs to be considered a title
|
||||||
language
|
languages
|
||||||
The two letter language code for the text. defaults to "en" for English
|
The list of languages present in the document. Defaults to ["eng"] for English
|
||||||
language_checks
|
language_checks
|
||||||
If True, conducts checks that are specific to the chosen language. Turn on for more
|
If True, conducts checks that are specific to the chosen language. Turn on for more
|
||||||
accurate partitioning and off for faster processing.
|
accurate partitioning and off for faster processing.
|
||||||
@ -146,8 +145,7 @@ def is_possible_title(
|
|||||||
if text.endswith(","):
|
if text.endswith(","):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
|
if "eng" in languages and not contains_english_word(text) and language_checks:
|
||||||
if language == "en" and not contains_english_word(text) and language_checks:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if text.isnumeric():
|
if text.isnumeric():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user