mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-04 06:13:48 +00:00
enhancement: add _clean_ocr_languages_arg helper function (#2413)
This PR is one in a series of PRs (see #2293) that refactor and fix the `languages` parameter so that it can cope with incorrect input from users. This PR adds the `_clean_ocr_languages_arg` helper. Nothing calls this function yet; later PRs in the series will call it.
This commit is contained in:
parent
c81d4e34be
commit
c34fac9c3a
@ -1,3 +1,11 @@
|
|||||||
|
## 0.12.2-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.12.1
|
## 0.12.1
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -5,6 +5,7 @@ from unstructured.documents.elements import (
|
|||||||
PageBreak,
|
PageBreak,
|
||||||
)
|
)
|
||||||
from unstructured.partition.lang import (
|
from unstructured.partition.lang import (
|
||||||
|
_clean_ocr_languages_arg,
|
||||||
_convert_language_code_to_pytesseract_lang_code,
|
_convert_language_code_to_pytesseract_lang_code,
|
||||||
apply_lang_metadata,
|
apply_lang_metadata,
|
||||||
detect_languages,
|
detect_languages,
|
||||||
@ -126,6 +127,22 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
|
|||||||
assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)
|
assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("input_ocr_langs", "expected"),
    [
        (["eng"], "eng"),  # list
        (["deu", "spa"], "deu+spa"),  # multi-item list is joined with "+"
        ('"deu"', "deu"),  # extra quotation marks
        ("[deu]", "deu"),  # brackets
        ("['deu']", "deu"),  # brackets and quotation marks
        (["[deu]"], "deu"),  # list and brackets
        (['"deu"'], "deu"),  # list and quotation marks
        ("deu+spa", "deu+spa"),  # correct input
    ],
)
def test_clean_ocr_languages_arg(input_ocr_langs, expected):
    """Each malformed `ocr_languages` value is cleaned into the expected string."""
    assert _clean_ocr_languages_arg(input_ocr_langs) == expected
|
||||||
|
|
||||||
|
|
||||||
def test_detect_languages_handles_spelled_out_languages():
|
def test_detect_languages_handles_spelled_out_languages():
|
||||||
languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"])
|
languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"])
|
||||||
assert languages == ["spa"]
|
assert languages == ["spa"]
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.12.1" # pragma: no cover
|
__version__ = "0.12.2-dev0" # pragma: no cover
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
from typing import Iterable, Iterator, List, Optional
|
from typing import Iterable, Iterator, List, Optional, Union
|
||||||
|
|
||||||
import iso639
|
import iso639
|
||||||
from langdetect import DetectorFactory, detect_langs, lang_detect_exception
|
from langdetect import DetectorFactory, detect_langs, lang_detect_exception
|
||||||
@ -380,3 +380,19 @@ def apply_lang_metadata(
|
|||||||
yield e
|
yield e
|
||||||
else:
|
else:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_ocr_languages_arg(ocr_languages: Union[List[str], str]) -> str:
    """Fix common incorrect definitions for ocr_languages.

    Handles the value being defined as a list, wrapped in extra quotation
    marks, wrapped in brackets, or written as a stringified list such as
    "['deu', 'spa']".

    Args:
        ocr_languages: Either a list of language codes or a single string
            that may contain stray quotes, brackets, commas or whitespace.

    Returns:
        A single string with the language codes separated by "+"
        (e.g. "deu+spa"). An input with no codes yields an empty string.
    """
    # extract from list (a tuple is the same mistake, so it is joined too)
    if isinstance(ocr_languages, (list, tuple)):
        ocr_languages = "+".join(ocr_languages)

    # remove extra quotations and brackets in a single pass
    ocr_languages = re.sub(r"[\"'\[\]]", "", ocr_languages)

    # A stringified multi-item list leaves ", " between the codes once the
    # quotes and brackets are gone. Split on commas, whitespace and "+" and
    # re-join so the result is always "+"-separated with no empty segments.
    codes = [code for code in re.split(r"[\s,+]+", ocr_languages) if code]
    return "+".join(codes)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user