mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 20:33:52 +00:00
enhancement: add _clean_ocr_languages_arg helper function (#2413)
This PR is one in a series of PRs (see #2293) that refactor and fix the `languages` parameter so that it can handle incorrect input from users. This PR adds `_clean_ocr_languages_arg`. There are no calls to this function yet; it will be called in later PRs in this series.
This commit is contained in:
parent
c81d4e34be
commit
c34fac9c3a
@ -1,3 +1,11 @@
|
||||
## 0.12.2-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.12.1
|
||||
|
||||
### Enhancements
|
||||
|
@ -5,6 +5,7 @@ from unstructured.documents.elements import (
|
||||
PageBreak,
|
||||
)
|
||||
from unstructured.partition.lang import (
|
||||
_clean_ocr_languages_arg,
|
||||
_convert_language_code_to_pytesseract_lang_code,
|
||||
apply_lang_metadata,
|
||||
detect_languages,
|
||||
@ -126,6 +127,22 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
|
||||
assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_ocr_langs", "expected"),
    [
        (["eng"], "eng"),  # list
        (["deu", "spa"], "deu+spa"),  # list with several languages is joined with "+"
        ('"deu"', "deu"),  # extra quotation marks
        ("[deu]", "deu"),  # brackets
        ("['deu']", "deu"),  # brackets and quotation marks
        (["[deu]"], "deu"),  # list and brackets
        (['"deu"'], "deu"),  # list and quotation marks
        ("deu+spa", "deu+spa"),  # correct input
    ],
)
def test_clean_ocr_languages_arg(input_ocr_langs, expected):
    """Check that common malformed `ocr_languages` values are cleaned into a plain string.

    Each case pairs a raw input (a list, or a string carrying stray quotation marks and/or
    square brackets) with the "+"-separated string the helper is expected to return.
    Well-formed input must pass through unchanged.
    """
    assert _clean_ocr_languages_arg(input_ocr_langs) == expected
|
||||
|
||||
|
||||
def test_detect_languages_handles_spelled_out_languages():
|
||||
languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"])
|
||||
assert languages == ["spa"]
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.12.1" # pragma: no cover
|
||||
__version__ = "0.12.2-dev0" # pragma: no cover
|
||||
|
@ -1,5 +1,5 @@
|
||||
import re
|
||||
from typing import Iterable, Iterator, List, Optional
|
||||
from typing import Iterable, Iterator, List, Optional, Union
|
||||
|
||||
import iso639
|
||||
from langdetect import DetectorFactory, detect_langs, lang_detect_exception
|
||||
@ -380,3 +380,19 @@ def apply_lang_metadata(
|
||||
yield e
|
||||
else:
|
||||
yield e
|
||||
|
||||
|
||||
def _clean_ocr_languages_arg(ocr_languages: Union[List[str], str]) -> str:
|
||||
"""Fix common incorrect definitions for ocr_languages:
|
||||
defining it as a list, adding extra quotation marks, adding brackets.
|
||||
Returns a single string of ocr_languages"""
|
||||
# extract from list
|
||||
if isinstance(ocr_languages, list):
|
||||
ocr_languages = "+".join(ocr_languages)
|
||||
|
||||
# remove extra quotations
|
||||
ocr_languages = re.sub(r"[\"']", "", ocr_languages)
|
||||
# remove brackets
|
||||
ocr_languages = re.sub(r"[\[\]]", "", ocr_languages)
|
||||
|
||||
return ocr_languages
|
||||
|
Loading…
x
Reference in New Issue
Block a user