enhancement: add _clean_ocr_languages_arg helper function (#2413)

This PR is one in a series of PRs that refactor and fix the `languages` parameter so that it can handle incorrect input from users (related: #2293). This PR adds `_clean_ocr_languages_arg`. There are no calls to this function yet, but it will be called in later PRs in this series.
2025-10-16 18:44:58 +00:00 · 2024-01-19 13:59:08 -06:00 · 2024-01-19 13:59:08 -06:00 · c34fac9c3a
commit c34fac9c3a
parent c81d4e34be
4 changed files with 43 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,11 @@
+## 0.12.2-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.12.1

 ### Enhancements
--- a/test_unstructured/partition/test_lang.py
+++ b/test_unstructured/partition/test_lang.py
@ -5,6 +5,7 @@ from unstructured.documents.elements import (
    PageBreak,
 )
 from unstructured.partition.lang import (
+    _clean_ocr_languages_arg,
    _convert_language_code_to_pytesseract_lang_code,
    apply_lang_metadata,
    detect_languages,
@ -126,6 +127,22 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
    assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)


+@pytest.mark.parametrize(
+    ("input_ocr_langs", "expected"),  # raw argument value -> expected cleaned string
+    [
+        (["eng"], "eng"),  # one-element list
+        ('"deu"', "deu"),  # string wrapped in extra double quotation marks
+        ("[deu]", "deu"),  # string wrapped in brackets
+        ("['deu']", "deu"),  # string with brackets and quotation marks
+        (["[deu]"], "deu"),  # list whose element is wrapped in brackets
+        (['"deu"'], "deu"),  # list whose element carries extra quotation marks
+        ("deu+spa", "deu+spa"),  # already-correct input comes back unchanged
+    ],
+)
+def test_clean_ocr_languages_arg(input_ocr_langs, expected):
+    assert _clean_ocr_languages_arg(input_ocr_langs) == expected
+
+
 def test_detect_languages_handles_spelled_out_languages():
    languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"])
    assert languages == ["spa"]
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.12.1"  # pragma: no cover
+__version__ = "0.12.2-dev0"  # pragma: no cover
--- a/unstructured/partition/lang.py
+++ b/unstructured/partition/lang.py
@ -1,5 +1,5 @@
 import re
-from typing import Iterable, Iterator, List, Optional
+from typing import Iterable, Iterator, List, Optional, Union

 import iso639
 from langdetect import DetectorFactory, detect_langs, lang_detect_exception
@ -380,3 +380,19 @@ def apply_lang_metadata(
                yield e
            else:
                yield e
+
+
+def _clean_ocr_languages_arg(ocr_languages: Union[List[str], str]) -> str:
+    """Fix common incorrect definitions for ocr_languages:
+    defining it as a list, adding extra quotation marks, adding brackets.
+    Returns a single string of ocr_languages"""
+    # extract from list
+    if isinstance(ocr_languages, list):
+        ocr_languages = "+".join(ocr_languages)
+
+    # remove extra quotations
+    ocr_languages = re.sub(r"[\"']", "", ocr_languages)
+    # remove brackets
+    ocr_languages = re.sub(r"[\[\]]", "", ocr_languages)
+
+    return ocr_languages