diff --git a/CHANGELOG.md b/CHANGELOG.md index e90429573..f11a6770e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ * **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those files as text. * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector. +* **Handle common incorrect arguments for `languages` and `ocr_languages`.** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotation marks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning. * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path. * **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. Pins Pinecone dependency * **Update documentation.** (i) best practice for table extration by using 'skip_infer_table_types' param, instead of 'pdf_infer_table_structure', and (ii) fixed CSS, RST issues and typo in the documentation. diff --git a/docs/source/core/partition.rst b/docs/source/core/partition.rst index c39357b42..3c897822b 100644 --- a/docs/source/core/partition.rst +++ b/docs/source/core/partition.rst @@ -404,8 +404,8 @@ The ``partition_image`` function has the same API as ``partition_pdf``, which is The only difference is that ``partition_image`` does not need to convert a PDF to an image prior to processing. 
The ``partition_image`` function supports ``.png`` and ``.jpg`` files. -You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example, -use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the +You can also specify what languages to use for OCR with the ``languages`` kwarg. For example, +use ``languages=["eng", "deu"]`` to use the English and German language packs. See the `Tesseract documentation `_ for a full list of languages and install instructions. @@ -420,7 +420,7 @@ Examples: elements = partition_image("example-docs/layout-parser-paper-fast.jpg") # Applies the English and Swedish language pack for ocr - elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe") + elements = partition_image("example-docs/layout-parser-paper-fast.jpg", languages=["eng", "swe"]) The ``strategy`` kwarg controls the method that will be used to process the PDF. @@ -449,7 +449,7 @@ have the Korean language pack for Tesseract installed on your system. from unstructured.partition.image import partition_image filename = "example-docs/english-and-korean.png" - elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only") + elements = partition_image(filename=filename, languages=["eng", "kor"], strategy="ocr_only") For more information about the ``partition_image`` function, you can check the `source code here `__. @@ -604,8 +604,8 @@ If you set the URL, ``partition_pdf`` will make a call to a remote inference ser ``partition_pdf`` also includes a ``token`` function that allows you to pass in an authentication token for a remote API call. -You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example, -use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the +You can also specify what languages to use for OCR with the ``languages`` kwarg. 
For example, +use ``languages=["eng", "deu"]`` to use the English and German language packs. See the `Tesseract documentation `_ for a full list of languages and install instructions. OCR is only applied if the text is not already available in the PDF document. @@ -620,7 +620,7 @@ Examples: # Applies the English and Swedish language pack for ocr. OCR is only applied # if the text is not available in the PDF. - elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", ocr_languages="eng+swe") + elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", languages=["eng", "swe"]) The ``strategy`` kwarg controls the method that will be used to process the PDF. @@ -859,7 +859,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY") -You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the +You can pass additional settings such as ``strategy``, ``languages`` and ``encoding`` to the API through optional kwargs. These options get added to the request body when the API is called. 
See `the API documentation `_ for a full list of diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 73183477f..7ef71a872 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -745,14 +745,6 @@ def test_partition_pdf_warns_with_ocr_languages(caplog): assert "The ocr_languages kwarg will be deprecated" in caplog.text -def test_partition_pdf_or_image_warns_with_ocr_languages(caplog): - filename = example_doc_path("DA-1p.pdf") - pdf.partition_pdf_or_image( - filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng" - ) - assert "The ocr_languages kwarg will be deprecated" in caplog.text - - def test_partition_categorization_backup(): text = "This is Clearly a Title" with mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=[Text(text)]): diff --git a/test_unstructured/partition/test_lang.py b/test_unstructured/partition/test_lang.py index c262a7c7a..e8bb5fedf 100644 --- a/test_unstructured/partition/test_lang.py +++ b/test_unstructured/partition/test_lang.py @@ -1,3 +1,13 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for the `unstructured.partition.lang` module.""" + +from __future__ import annotations + +import os +import pathlib +from typing import Union + import pytest from unstructured.documents.elements import ( @@ -8,10 +18,14 @@ from unstructured.partition.lang import ( _clean_ocr_languages_arg, _convert_language_code_to_pytesseract_lang_code, apply_lang_metadata, + check_language_args, detect_languages, prepare_languages_for_tesseract, ) +DIRECTORY = pathlib.Path(__file__).parent.resolve() +EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") + def test_prepare_languages_for_tesseract_with_one_language(): languages = ["en"] @@ -146,3 +160,87 @@ def test_clean_ocr_languages_arg(input_ocr_langs, expected): def 
test_detect_languages_handles_spelled_out_languages(): languages = detect_languages(text="Sample text longer than 5 words.", languages=["Spanish"]) assert languages == ["spa"] + + +@pytest.mark.parametrize( + ("languages", "ocr_languages", "expected_langs"), + [ + (["spa"], "deu", ["spa"]), + (["spanish"], "english", ["spa"]), + (["spa"], "[deu]", ["spa"]), + (["spa"], '"deu"', ["spa"]), + (["spa"], ["deu"], ["spa"]), + (["spa"], ["[deu]"], ["spa"]), + (["spa+deu"], "eng+deu", ["spa", "deu"]), + ], +) +def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined( + languages: Union[list[str], str], + ocr_languages: Union[list[str], str, None], + expected_langs: list[str], + caplog, +): + returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) + for lang in returned_langs: # type: ignore + assert lang in expected_langs + assert "ocr_languages" in caplog.text + + +@pytest.mark.parametrize( + ("languages", "ocr_languages", "expected_langs"), + [ + # raise warning and use `ocr_languages` when `languages` is empty or None + ([], "deu", ["deu"]), + ([""], '"deu"', ["deu"]), + ([""], "deu", ["deu"]), + ([""], "[deu]", ["deu"]), + ], +) +def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( + languages: Union[list[str], str], + ocr_languages: Union[list[str], str, None], + expected_langs: list[str], + caplog, +): + returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) + for lang in returned_langs: # type: ignore + assert lang in expected_langs + assert "ocr_languages" in caplog.text + + +@pytest.mark.parametrize( + ("languages", "ocr_languages"), + [ + ([], None), # how check_language_args is called from auto.partition() + ([""], None), + ], +) +def test_check_language_args_returns_None( + languages: Union[list[str], str, None], + ocr_languages: Union[list[str], str, None], +): + returned_langs = check_language_args(languages=languages, 
ocr_languages=ocr_languages) + assert returned_langs is None + + +def test_check_language_args_returns_auto( + languages=["eng", "spa", "auto"], + ocr_languages=None, +): + returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) + assert returned_langs == ["auto"] + + +@pytest.mark.parametrize( + ("languages", "ocr_languages"), + [ + ([], ["auto"]), + ([""], "eng+auto"), + ], +) +def test_check_language_args_raises_error_when_ocr_languages_contains_auto( + languages: Union[list[str], str, None], + ocr_languages: Union[list[str], str, None], +): + with pytest.raises(ValueError): + check_language_args(languages=languages, ocr_languages=ocr_languages) diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 7707232ca..446bd7c80 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -17,7 +17,7 @@ from unstructured.partition.email import partition_email from unstructured.partition.html import partition_html from unstructured.partition.json import partition_json from unstructured.partition.lang import ( - convert_old_ocr_languages_to_languages, + check_language_args, ) from unstructured.partition.text import partition_text from unstructured.partition.utils.constants import PartitionStrategy @@ -252,23 +252,7 @@ def partition( ) kwargs.setdefault("metadata_filename", metadata_filename) - if ocr_languages == "": - ocr_languages = None - - if ocr_languages is not None: - # check if languages was set to anything not the default value - # languages and ocr_languages were therefore both provided - raise error - if languages is not None: - raise ValueError( - "Only one of languages and ocr_languages should be specified. " - "languages is preferred. ocr_languages is marked for deprecation.", - ) - else: - languages = convert_old_ocr_languages_to_languages(ocr_languages) - logger.warning( - "The ocr_languages kwarg will be deprecated in a future version of unstructured. 
" - "Please use languages instead.", - ) + languages = check_language_args(languages or [], ocr_languages) if url is not None: file, filetype = file_and_type_from_url( diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 27e1fb03b..d94ead56c 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -3,10 +3,9 @@ from typing import List, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata -from unstructured.logger import logger from unstructured.partition.common import exactly_one from unstructured.partition.lang import ( - convert_old_ocr_languages_to_languages, + check_language_args, ) from unstructured.partition.pdf import partition_pdf_or_image from unstructured.partition.utils.constants import PartitionStrategy @@ -21,7 +20,7 @@ def partition_image( include_page_breaks: bool = False, infer_table_structure: bool = False, ocr_languages: Optional[str] = None, - languages: Optional[List[str]] = ["eng"], + languages: Optional[List[str]] = None, strategy: str = PartitionStrategy.HI_RES, metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, @@ -87,27 +86,7 @@ def partition_image( """ exactly_one(filename=filename, file=file) - if languages is None: - languages = ["eng"] - - if not isinstance(languages, list): - raise TypeError( - 'The language parameter must be a list of language codes as strings, ex. ["eng"]', - ) - - if ocr_languages is not None: - if languages != ["eng"]: - raise ValueError( - "Only one of languages and ocr_languages should be specified. " - "languages is preferred. ocr_languages is marked for deprecation.", - ) - - else: - languages = convert_old_ocr_languages_to_languages(ocr_languages) - logger.warning( - "The ocr_languages kwarg will be deprecated in a future version of unstructured. 
" + "Please use languages instead.", - ) + languages = check_language_args(languages or [], ocr_languages) or ["eng"] return partition_pdf_or_image( filename=filename, diff --git a/unstructured/partition/lang.py b/unstructured/partition/lang.py index d2aa6a6a2..64228ba76 100644 --- a/unstructured/partition/lang.py +++ b/unstructured/partition/lang.py @@ -143,7 +143,7 @@ PYTESSERACT_LANG_CODES = [ ] -def prepare_languages_for_tesseract(languages: Optional[List[str]] = ["eng"]): +def prepare_languages_for_tesseract(languages: Optional[List[str]] = ["eng"]) -> str: """ Entry point: convert languages (list of strings) into tesseract ocr langcode format (uses +) """ @@ -167,34 +167,65 @@ def prepare_languages_for_tesseract(languages: Optional[List[str]] = ["eng"]): return TESSERACT_LANGUAGES_SPLITTER.join(converted_languages) -def check_languages(languages: Optional[List[str]], ocr_languages: Optional[str]): - """Handle `ocr_languages` and `languages`, defining `languages` to ['eng'] as default and - converting `ocr_languages` if needed""" - if languages is None: - languages = ["eng"] +def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> Optional[list[str]]: + """Handle users defining both `ocr_languages` and `languages`, giving preference to `languages` + and converting `ocr_languages` if needed, but defaulting to `None`. + + `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`. + `ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection + which is not supported by `partition_image` or `partition_pdf`.""" + # --- Clean and update defaults + if ocr_languages: + ocr_languages = _clean_ocr_languages_arg(ocr_languages) + logger.warning( + "The ocr_languages kwarg will be deprecated in a future version of unstructured. 
" + "Please use languages instead.", + ) + + if ocr_languages and "auto" in ocr_languages: + raise ValueError( + "`ocr_languages` is deprecated but was used to extract text from pdfs and images." + " The 'auto' argument is only for language *detection* when it is assigned" + " to `languages` and partitioning documents other than pdfs or images." + " Language detection is not currently supported in pdfs or images." + ) if not isinstance(languages, list): raise TypeError( "The language parameter must be a list of language codes as strings, ex. ['eng']", ) - if ocr_languages is not None: - if languages != ["eng"]: - raise ValueError( - "Only one of languages and ocr_languages should be specified. " - "languages is preferred. ocr_languages is marked for deprecation.", - ) + # --- If `languages` is a null/default value and `ocr_languages` is defined, use `ocr_languages` + if ocr_languages and (languages == ["auto"] or languages == [""] or not languages): + languages = ocr_languages.split(TESSERACT_LANGUAGES_SPLITTER) + logger.warning( + "Only one of languages and ocr_languages should be specified. " + "languages is preferred. ocr_languages is marked for deprecation.", + ) + # --- Clean `languages` + # If "auto" is included in the list of inputs, language detection will be triggered downstream. + # The rest of the inputted languages are ignored. + if languages: + if "auto" not in languages: + for i, lang in enumerate(languages): + languages[i] = TESSERACT_LANGUAGES_AND_CODES.get(lang.lower(), lang) + + str_languages = _clean_ocr_languages_arg(languages) + if not str_languages: + return None + languages = str_languages.split(TESSERACT_LANGUAGES_SPLITTER) + # else, remove the extraneous languages. + # NOTE (jennings): "auto" should only be used for partitioners OTHER THAN `_pdf` or `_image` else: - languages = convert_old_ocr_languages_to_languages(ocr_languages) - logger.warning( - "The ocr_languages kwarg will be deprecated in a future version of unstructured. 
" - "Please use languages instead.", - ) - return languages + # define as 'auto' for language detection when partitioning non-pdfs or -images + languages = ["auto"] + return languages + + return None -def convert_old_ocr_languages_to_languages(ocr_languages: str): +def convert_old_ocr_languages_to_languages(ocr_languages: str) -> list[str]: """ Convert ocr_languages parameter to list of langcode strings. Assumption: ocr_languages is in tesseract plus sign format @@ -251,7 +282,7 @@ def _get_iso639_language_object(lang: str) -> Optional[iso639.Language]: return None -def _get_all_tesseract_langcodes_with_prefix(prefix: str): +def _get_all_tesseract_langcodes_with_prefix(prefix: str) -> list[str]: """ Get all matching tesseract langcodes with this prefix (may be one or multiple variants). """ @@ -342,7 +373,9 @@ def apply_lang_metadata( languages: Optional[List[str]], detect_language_per_element: bool = False, ) -> Iterator[Element]: - """Detect and apply metadata.languages to each element in `elements`.""" + """Detect language and apply it to metadata.languages for each element in `elements`. + If languages is None, default to auto detection. + If languages is an empty string, skip.""" # -- Note this function has a stream interface, but reads the full `elements` stream into memory # -- before emitting the first updated element as output. 
@@ -359,6 +392,7 @@ def apply_lang_metadata( yield from elements return + # Convert elements to a list to get the text, detect the language, and add it to the elements if not isinstance(elements, List): elements = list(elements) @@ -369,7 +403,7 @@ def apply_lang_metadata( and len(languages) == 1 and detect_language_per_element is False ): - # -- apply detected language to each metadata -- + # -- apply detected language to each element's metadata -- for e in elements: e.metadata.languages = detected_languages yield e diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 3cc2a2063..721d10880 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -70,7 +70,7 @@ from unstructured.partition.common import ( spooled_to_bytes_io_if_needed, ) from unstructured.partition.lang import ( - check_languages, + check_language_args, prepare_languages_for_tesseract, ) from unstructured.partition.pdf_image.pdf_image_utils import ( @@ -208,7 +208,7 @@ def partition_pdf( exactly_one(filename=filename, file=file) - languages = check_languages(languages, ocr_languages) + languages = check_language_args(languages or [], ocr_languages) or ["eng"] return partition_pdf_or_image( filename=filename, @@ -494,8 +494,6 @@ def partition_pdf_or_image( validate_strategy(strategy, is_image) - languages = check_languages(languages, ocr_languages) - last_modification_date = get_the_last_modification_date_pdf_or_img( file=file, filename=filename,