mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-29 08:05:08 +00:00
feat: ability to skip English language specific checks with env var (#224)
* add language env var * update docs * version and bump change log
This commit is contained in:
parent
a68dc35940
commit
558ee63e90
@ -1,7 +1,8 @@
|
||||
## 0.4.9-dev1
|
||||
## 0.4.9-dev2
|
||||
|
||||
* Added ingest modules and s3 connector
|
||||
* Default to `url=None` for `partition_pdf` and `partition_image`
|
||||
* Add ability to skip English-specific checks by setting the `UNSTRUCTURED_LANGUAGE` env var to `""`.
|
||||
|
||||
## 0.4.8
|
||||
|
||||
|
||||
@ -280,6 +280,7 @@ for consideration as narrative text. The function performs the following checks
|
||||
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
|
||||
The environment variable takes precedence over the kwarg.
|
||||
* The cap ratio test does not apply to text that is all uppercase.
|
||||
* If you use the ``language=""`` kwarg or set the ``UNSTRUCTURED_LANGUAGE`` environment variable to ``""``, the function will skip the verb check and the English word check.
|
||||
|
||||
|
||||
Examples:
|
||||
@ -320,6 +321,8 @@ for consideration as a title. The function performs the following checks:
|
||||
* Title text must contain at least one English word (if ``language`` is set to "en")
|
||||
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
|
||||
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
|
||||
* If you use the ``language=""`` kwarg or set the ``UNSTRUCTURED_LANGUAGE`` environment variable to ``""``, the function will skip the English word check.
|
||||
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
@ -50,6 +50,31 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
|
||||
assert is_possible_narrative is expected
|
||||
|
||||
|
||||
def test_text_type_handles_non_english_examples():
    """Check the ``language`` kwarg against Russian samples.

    Each sample is classified once with ``language="en"`` and once with
    ``language=""``; the expected results are listed in the tables below.
    """
    russian_sentences = "Я говорю по-русски. Вы тоже?"
    russian_heading = "Риски"

    # (sample text, language kwarg, expected narrative-text verdict)
    narrative_expectations = [
        (russian_sentences, "en", False),
        (russian_sentences, "", True),
        (russian_heading, "en", False),
        (russian_heading, "", False),
    ]
    for sample, lang, expected in narrative_expectations:
        verdict = text_type.is_possible_narrative_text(sample, language=lang)
        assert verdict is expected

    # (language kwarg, expected title verdict) for the one-word heading
    title_expectations = [("en", False), ("", True)]
    for lang, expected in title_expectations:
        verdict = text_type.is_possible_title(russian_heading, language=lang)
        assert verdict is expected
|
||||
|
||||
|
||||
def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
    """Check Russian samples with ``UNSTRUCTURED_LANGUAGE`` set to ``""``.

    No ``language`` kwarg is passed, so the functions run with their default
    argument while the environment variable is set to the empty string.
    """
    monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")

    russian_sentences = "Я говорю по-русски. Вы тоже?"
    russian_heading = "Риски"

    # (classifier, sample text, expected verdict)
    expectations = [
        (text_type.is_possible_narrative_text, russian_sentences, True),
        (text_type.is_possible_narrative_text, russian_heading, False),
        (text_type.is_possible_title, russian_heading, True),
    ]
    for classify, sample, expected in expectations:
        assert classify(sample) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.9-dev1" # pragma: no cover
|
||||
__version__ = "0.4.9-dev2" # pragma: no cover
|
||||
|
||||
@ -49,6 +49,7 @@ def is_possible_narrative_text(
|
||||
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
|
||||
return False
|
||||
|
||||
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
|
||||
if language == "en" and not contains_english_word(text):
|
||||
return False
|
||||
|
||||
@ -67,7 +68,7 @@ def is_possible_narrative_text(
|
||||
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
|
||||
return False
|
||||
|
||||
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
|
||||
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
|
||||
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
|
||||
return False
|
||||
|
||||
@ -118,6 +119,7 @@ def is_possible_title(
|
||||
if text.endswith(","):
|
||||
return False
|
||||
|
||||
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
|
||||
if language == "en" and not contains_english_word(text):
|
||||
return False
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user