feat: ability to skip English language specific checks with env var (#224)

* add language env var

* update docs

* version and bump change log
This commit is contained in:
Matt Robinson 2023-02-15 09:15:47 -05:00 committed by GitHub
parent a68dc35940
commit 558ee63e90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 34 additions and 3 deletions

View File

@ -1,7 +1,8 @@
## 0.4.9-dev1
## 0.4.9-dev2
* Added ingest modules and s3 connector
* Default to `url=None` for `partition_pdf` and `partition_image`
* Add the ability to skip English-specific checks by setting the `UNSTRUCTURED_LANGUAGE` env var to `""`.
## 0.4.8

View File

@ -280,6 +280,7 @@ for consideration as narrative text. The function performs the following checks
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
The environment variable takes precedence over the kwarg.
* The cap ratio test does not apply to text that is all uppercase.
* If you use the ``language=""`` kwarg or set the ``UNSTRUCTURED_LANGUAGE`` environment variable to ``""``, the function will skip the verb check and the English word check.
Examples:
@ -320,6 +321,8 @@ for consideration as a title. The function performs the following checks:
* The text must contain at least one English word (if ``language`` is set to ``"en"``)
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
* If you use the ``language=""`` kwarg or set the ``UNSTRUCTURED_LANGUAGE`` environment variable to ``""``, the function will skip the English word check.
Examples:

View File

@ -50,6 +50,31 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
assert is_possible_narrative is expected
def test_text_type_handles_non_english_examples():
    """Exercise the ``language`` kwarg of the text-type checks on Russian text.

    With ``language="en"`` both samples are expected to be rejected by every
    check. With ``language=""`` the two-sentence sample is expected to pass as
    narrative text, while the single-word sample is expected to pass as a
    title but still not as narrative text.
    """
    russian_sentences = "Я говорю по-русски. Вы тоже?"
    russian_heading = "Риски"

    # Each row: (predicate under test, input text, language kwarg, expected result).
    expectations = (
        (text_type.is_possible_narrative_text, russian_sentences, "en", False),
        (text_type.is_possible_narrative_text, russian_sentences, "", True),
        (text_type.is_possible_narrative_text, russian_heading, "en", False),
        (text_type.is_possible_narrative_text, russian_heading, "", False),
        (text_type.is_possible_title, russian_heading, "en", False),
        (text_type.is_possible_title, russian_heading, "", True),
    )

    for predicate, sample, lang, expected in expectations:
        assert predicate(sample, language=lang) is expected
def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
    """Exercise the text-type checks with ``UNSTRUCTURED_LANGUAGE`` set to ``""``.

    No ``language`` kwarg is passed, so the expected results below depend on
    the environment variable alone.
    """
    monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")

    russian_sentences = "Я говорю по-русски. Вы тоже?"
    russian_heading = "Риски"

    # Two-sentence sample: expected to be accepted as narrative text.
    sentences_are_narrative = text_type.is_possible_narrative_text(russian_sentences)
    assert sentences_are_narrative is True

    # Single-word sample: expected to be rejected as narrative text ...
    heading_is_narrative = text_type.is_possible_narrative_text(russian_heading)
    assert heading_is_narrative is False

    # ... but accepted as a title.
    heading_is_title = text_type.is_possible_title(russian_heading)
    assert heading_is_title is True
@pytest.mark.parametrize(
"text, expected",
[

View File

@ -1 +1 @@
__version__ = "0.4.9-dev1" # pragma: no cover
__version__ = "0.4.9-dev2" # pragma: no cover

View File

@ -49,6 +49,7 @@ def is_possible_narrative_text(
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
return False
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
if language == "en" and not contains_english_word(text):
return False
@ -67,7 +68,7 @@ def is_possible_narrative_text(
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
return False
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
return False
@ -118,6 +119,7 @@ def is_possible_title(
if text.endswith(","):
return False
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
if language == "en" and not contains_english_word(text):
return False