enhancement: add _clean_ocr_languages_arg helper function (#2413)

This PR is one in a series of PRs that refactor and fix the languages
parameter so that it tolerates incorrectly formatted user input (see #2293).

This PR adds _clean_ocr_languages_arg. There are no calls to this
function yet, but it will be called in later PRs related to this series.
This commit is contained in:
John 2024-01-19 13:59:08 -06:00 committed by GitHub
parent c81d4e34be
commit c34fac9c3a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 43 additions and 2 deletions

View File

@ -1,3 +1,11 @@
## 0.12.2-dev0
### Enhancements
### Features
### Fixes
## 0.12.1
### Enhancements

View File

@ -5,6 +5,7 @@ from unstructured.documents.elements import (
PageBreak,
)
from unstructured.partition.lang import (
_clean_ocr_languages_arg,
_convert_language_code_to_pytesseract_lang_code,
apply_lang_metadata,
detect_languages,
@ -126,6 +127,22 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)
@pytest.mark.parametrize(
    ("input_ocr_langs", "expected"),
    [
        # passed as a list instead of a string
        pytest.param(["eng"], "eng"),
        # string wrapped in extra quotation marks
        pytest.param('"deu"', "deu"),
        # string wrapped in brackets
        pytest.param("[deu]", "deu"),
        # string wrapped in brackets and quotation marks
        pytest.param("['deu']", "deu"),
        # list whose item is wrapped in brackets
        pytest.param(["[deu]"], "deu"),
        # list whose item is wrapped in quotation marks
        pytest.param(['"deu"'], "deu"),
        # already well-formed input is returned unchanged
        pytest.param("deu+spa", "deu+spa"),
    ],
)
def test_clean_ocr_languages_arg(input_ocr_langs, expected):
    """Each malformed `ocr_languages` value is cleaned to the expected string."""
    cleaned = _clean_ocr_languages_arg(input_ocr_langs)
    assert cleaned == expected
def test_detect_languages_handles_spelled_out_languages():
languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"])
assert languages == ["spa"]

View File

@ -1 +1 @@
__version__ = "0.12.1" # pragma: no cover
__version__ = "0.12.2-dev0" # pragma: no cover

View File

@ -1,5 +1,5 @@
import re
from typing import Iterable, Iterator, List, Optional
from typing import Iterable, Iterator, List, Optional, Union
import iso639
from langdetect import DetectorFactory, detect_langs, lang_detect_exception
@ -380,3 +380,19 @@ def apply_lang_metadata(
yield e
else:
yield e
def _clean_ocr_languages_arg(ocr_languages: Union[List[str], str]) -> str:
"""Fix common incorrect definitions for ocr_languages:
defining it as a list, adding extra quotation marks, adding brackets.
Returns a single string of ocr_languages"""
# extract from list
if isinstance(ocr_languages, list):
ocr_languages = "+".join(ocr_languages)
# remove extra quotations
ocr_languages = re.sub(r"[\"']", "", ocr_languages)
# remove brackets
ocr_languages = re.sub(r"[\[\]]", "", ocr_languages)
return ocr_languages