Mirror of https://github.com/Unstructured-IO/unstructured.git
chore: improve kwarg handling (#1810)
Closes `unstructured-inference` issue [#265](https://github.com/Unstructured-IO/unstructured-inference/issues/265). Cleans up the kwarg handling, replacing places where kwargs were read out of a dict (`kwargs.get(...)`) with explicit parameters in the function signatures.

#### Testing:

Should just pass CI.
This commit is contained in:
parent 82c8adba3f
commit 7fdddfbc1e
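The pattern repeated throughout the diff below: options that used to ride along in `**kwargs` and get re-extracted with `kwargs.get(...)` become explicit keyword parameters. A minimal before/after sketch of that shape, with illustrative names that are not taken from the diff:

```python
from typing import List

SORT_MODE_XY_CUT = "xy-cut"  # mirrors the constant used in the diff

# Before: the option is invisible in the signature and re-extracted as a dict entry.
def process_before(pages: List[str], **kwargs) -> List[str]:
    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)
    return [f"{page}:{sort_mode}" for page in pages]

# After: the option is a real parameter with a default, so it is discoverable
# from the signature, documented, and type-checkable.
def process_after(pages: List[str], sort_mode: str = SORT_MODE_XY_CUT, **kwargs) -> List[str]:
    return [f"{page}:{sort_mode}" for page in pages]

assert process_before(["p1"]) == process_after(["p1"])
assert process_after(["p1"], sort_mode="basic") == ["p1:basic"]
```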
@@ -1,3 +1,11 @@
+## 0.10.26-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.10.25

 ### Enhancements
@@ -76,9 +76,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/embed-huggingface.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 markupsafe==2.1.3
     # via jinja2
@@ -6,7 +6,7 @@ pdf2image
 pdfminer.six
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
@@ -203,7 +203,7 @@ sympy==1.12
     # via
     #   onnxruntime
     #   torch
-timm==0.9.7
+timm==0.9.8
     # via effdet
 tokenizers==0.14.1
     # via transformers
@@ -236,7 +236,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
     # via -r requirements/extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via
@@ -6,7 +6,7 @@
 #
 azure-common==1.1.28
     # via azure-search-documents
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   azure-search-documents
     #   msrest
@@ -14,7 +14,7 @@ async-timeout==4.0.3
     # via aiohttp
 attrs==23.1.0
     # via aiohttp
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   adlfs
     #   azure-identity
@@ -61,9 +61,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-bedrock.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via
@@ -4,7 +4,7 @@
 #
 # pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-confluence.in
 certifi==2023.7.22
     # via
@@ -4,7 +4,7 @@
 #
 # pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-jira.in
 certifi==2023.7.22
     # via
@@ -50,9 +50,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-openai.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via
@@ -103,7 +103,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.1.0
+ruff==0.1.1
     # via -r requirements/test.in
 six==1.16.0
     # via
@@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
-        pdf_image_dpi=200,
+        pdf_image_dpi=mock.ANY,
+        extract_tables=mock.ANY,
         model_name="checkbox",
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
@@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
-        pdf_image_dpi=200,
+        pdf_image_dpi=mock.ANY,
+        extract_tables=mock.ANY,
         model_name="checkbox",
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
@@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
+        extract_tables=mock.ANY,
         model_name=pdf.default_hi_res_model(),
         pdf_image_dpi=100,
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
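These assertions pin only the arguments the test actually cares about and match the rest with `unittest.mock.ANY`, a standard-library sentinel that compares equal to any value. A self-contained sketch of how that works:

```python
from unittest import mock

# mock.ANY compares equal to anything, including None.
assert mock.ANY == 200
assert mock.ANY == None  # noqa: E711 — deliberate: ANY matches None too

m = mock.Mock()
m("doc.pdf", is_image=False, pdf_image_dpi=100, extract_tables=True)

# Pin pdf_image_dpi exactly; accept whatever extract_tables happened to be.
m.assert_called_once_with(
    "doc.pdf", is_image=False, pdf_image_dpi=100, extract_tables=mock.ANY
)
```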
@@ -3,7 +3,7 @@ import os
 import pathlib
 import warnings
 from importlib import import_module
-from unittest.mock import patch
+from unittest.mock import ANY, patch

 import docx
 import pytest
@@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         url=None,
         include_page_breaks=False,
         infer_table_structure=False,
+        extract_images_in_pdf=ANY,
+        image_output_dir_path=ANY,
         strategy="fast",
         languages=None,
     )
@@ -1 +1 @@
-__version__ = "0.10.25" # pragma: no cover
+__version__ = "0.10.26-dev0" # pragma: no cover
@@ -135,6 +135,8 @@ def partition(
     languages: Optional[List[str]] = None,
     detect_language_per_element: bool = False,
     pdf_infer_table_structure: bool = False,
+    pdf_extract_images: bool = False,
+    pdf_image_output_dir_path: Optional[str] = None,
     xml_keep_tags: bool = False,
     data_source_metadata: Optional[DataSourceMetadata] = None,
     metadata_filename: Optional[str] = None,
@@ -186,6 +188,12 @@ def partition(
             additional metadata field, "text_as_html," where the value (string) is a just a
             transformation of the data into an HTML <table>.
             The "text" field for a partitioned Table Element is always present, whether True or False.
+        pdf_extract_images
+            If True and strategy=hi_res, any detected images will be saved in the path specified by
+            pdf_image_output_dir_path.
+        pdf_image_output_dir_path
+            If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
+            given path
         xml_keep_tags
             If True, will retain the XML tags in the output. Otherwise it will simply extract
             the text from within the tags. Only applies to partition_xml.
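A hedged usage sketch for the two new `partition()` parameters documented above (the input path and output directory are illustrative, not from the diff):

```python
from unstructured.partition.auto import partition

# Per the docstring, pdf_extract_images only has an effect with strategy="hi_res".
elements = partition(
    filename="example.pdf",                     # illustrative input path
    strategy="hi_res",
    pdf_extract_images=True,
    pdf_image_output_dir_path="./pdf-images",   # illustrative output directory
)
```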
@@ -367,6 +375,8 @@ def partition(
             infer_table_structure=infer_table_structure,
             strategy=strategy,
             languages=languages,
+            extract_images_in_pdf=pdf_extract_images,
+            image_output_dir_path=pdf_image_output_dir_path,
             **kwargs,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
@@ -35,10 +35,7 @@ from unstructured.documents.elements import (
 )
 from unstructured.logger import logger
 from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
-from unstructured.partition.utils.constants import (
-    SORT_MODE_DONT,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
 from unstructured.utils import dependency_exists, first

 if dependency_exists("docx") and dependency_exists("docx.table"):
@@ -551,11 +548,11 @@ def document_to_element_list(
     infer_list_items: bool = True,
     source_format: Optional[str] = None,
     detection_origin: Optional[str] = None,
+    sort_mode: str = SORT_MODE_XY_CUT,
     **kwargs,
 ) -> List[Element]:
     """Converts a DocumentLayout object to a list of unstructured elements."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

     num_pages = len(document.pages)
     for i, page in enumerate(document.pages):
@@ -100,6 +100,8 @@ def partition_pdf(
     metadata_last_modified: Optional[str] = None,
     chunking_strategy: Optional[str] = None,
     links: Sequence[Link] = [],
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -135,6 +137,12 @@ def partition_pdf(
         processing text/plain content.
     metadata_last_modified
         The last modified date for the document.
+    extract_images_in_pdf
+        If True and strategy=hi_res, any detected images will be saved in the path specified by
+        image_output_dir_path.
+    image_output_dir_path
+        If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
+        given path
     """
     exactly_one(filename=filename, file=file)
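Note the naming split: `partition()` exposes these options as `pdf_extract_images` / `pdf_image_output_dir_path` and forwards them to `partition_pdf()`, which takes `extract_images_in_pdf` / `image_output_dir_path` (visible in the `partition()` call-site hunk above). Calling `partition_pdf()` directly looks like this sketch (paths illustrative):

```python
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example.pdf",                # illustrative input path
    strategy="hi_res",
    extract_images_in_pdf=True,
    image_output_dir_path="./pdf-images",  # illustrative output directory
)
```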
@@ -164,6 +172,8 @@ def partition_pdf(
         max_partition=max_partition,
         min_partition=min_partition,
         metadata_last_modified=metadata_last_modified,
+        extract_images_in_pdf=extract_images_in_pdf,
+        image_output_dir_path=image_output_dir_path,
         **kwargs,
     )
@@ -210,6 +220,8 @@ def partition_pdf_or_image(
     max_partition: Optional[int] = 1500,
     min_partition: Optional[int] = 0,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -292,6 +304,8 @@ def partition_pdf_or_image(
             include_page_breaks=include_page_breaks,
             languages=languages,
             metadata_last_modified=metadata_last_modified or last_modification_date,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
             **kwargs,
         )
     layout_elements = []
@@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     model_name: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
+    pdf_image_dpi: Optional[int] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally."""
@@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
     ocr_languages = prepare_languages_for_tesseract(languages)

     model_name = model_name or default_hi_res_model()
-    pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
     if pdf_image_dpi is None:
         pdf_image_dpi = 300 if model_name == "chipper" else 200
     if (pdf_image_dpi < 300) and (model_name == "chipper"):
@@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )

-    # NOTE(christine): Need to extract images from PDF's
-    extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
-    image_output_dir_path = kwargs.get("image_output_dir_path", None)
-    process_with_model_extra_kwargs = {
-        "extract_images_in_pdf": extract_images_in_pdf,
-        "image_output_dir_path": image_output_dir_path,
-    }
-
-    process_with_model_kwargs = {}
-    for key, value in process_with_model_extra_kwargs.items():
-        if value:
-            process_with_model_kwargs[key] = value
-
     if file is None:
         # NOTE(christine): out_layout = extracted_layout + inferred_layout
         out_layout = process_file_with_model(
             filename,
             is_image=is_image,
             extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
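The deleted block built `process_with_model_kwargs` by keeping only truthy values, so falsy settings were never forwarded; because the callee's defaults already match those falsy values, passing the keywords straight through is equivalent and much simpler. A small sketch of the two styles (function names are illustrative, not from the diff):

```python
def forward_old_style(**kwargs):
    extra = {
        "extract_images_in_pdf": kwargs.get("extract_images_in_pdf", False),
        "image_output_dir_path": kwargs.get("image_output_dir_path", None),
    }
    # Truthy filter: False/None entries silently disappear before forwarding.
    return {k: v for k, v in extra.items() if v}

def forward_new_style(extract_images_in_pdf=False, image_output_dir_path=None):
    # Every value is forwarded explicitly; the defaults make filtering unnecessary.
    return {
        "extract_images_in_pdf": extract_images_in_pdf,
        "image_output_dir_path": image_output_dir_path,
    }

assert forward_old_style() == {}
assert forward_new_style() == {
    "extract_images_in_pdf": False,
    "image_output_dir_path": None,
}
```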
@@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
         out_layout = process_data_with_model(
             file,
             is_image=is_image,
             extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -528,11 +535,11 @@ def _process_pdfminer_pages(
     filename: str = "",
     include_page_breaks: bool = False,
     metadata_last_modified: Optional[str] = None,
+    sort_mode: str = SORT_MODE_XY_CUT,
     **kwargs,
 ):
     """Uses PDF miner to split a document into pages and process them."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
@@ -11,6 +11,7 @@ SORT_MODE_XY_CUT = "xy-cut"
 SORT_MODE_BASIC = "basic"
 SORT_MODE_DONT = "dont"

+
 SUBREGION_THRESHOLD_FOR_OCR = 0.5
 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
@@ -5,10 +5,7 @@ import numpy as np

 from unstructured.documents.elements import CoordinatesMetadata, Element
 from unstructured.logger import trace_logger
-from unstructured.partition.utils.constants import (
-    SORT_MODE_BASIC,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
 from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped