chore: add hi_res_model_name kwarg (#2289)

Closes #2160.

Explicitly adds `hi_res_model_name` as a kwarg to the relevant functions and notes that `model_name` is to be deprecated.

Testing:

```python
from unstructured.partition.auto import partition

filename = "example-docs/DA-1p.pdf"
elements = partition(filename, strategy="hi_res", hi_res_model_name="yolox")
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Steve Canny <stcanny@gmail.com>
Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
2025-12-03 02:29:52 +00:00 · 2023-12-22 09:06:54 -06:00 · 2023-12-22 09:06:54 -06:00 · 5c0043aa7d
commit 5c0043aa7d
parent 093a11d058
7 changed files with 109 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -24,6 +24,8 @@
 ### Fixes
 * **Enable --fields argument omission for elasticsearch connector** Solves two bugs where removing the optional parameter --fields broke the connector due to an integer processing error and using an elasticsearch config for a destination connector resulted in a serialization issue when optional parameter --fields was not provided.

+* **Add hi_res_model_name** Adds the `hi_res_model_name` kwarg to the relevant functions and adds comments noting that `model_name` is to be deprecated.
+
 ## 0.11.5

 ### Enhancements
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@ -536,6 +536,18 @@ def test_partition_image_uses_model_name():
        assert mockpartition.call_args.kwargs["model_name"]


+def test_partition_image_uses_hi_res_model_name():
+    """`partition_image` forwards `hi_res_model_name` as-is and does not inject `model_name`."""
+    with mock.patch.object(
+        pdf,
+        "_partition_pdf_or_image_local",
+    ) as mockpartition:
+        image.partition_image("example-docs/layout-parser-paper-fast.jpg", hi_res_model_name="test")
+        mockpartition.assert_called_once()
+        assert "model_name" not in mockpartition.call_args.kwargs
+        assert mockpartition.call_args.kwargs["hi_res_model_name"] == "test"
+
+
@pytest.mark.parametrize(
    ("ocr_mode", "idx_title_element"),
    [
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -215,6 +215,40 @@ def test_partition_pdf_with_model_name(
        assert mock_process.call_args[1]["model_name"] == "checkbox"


+def test_partition_pdf_with_hi_res_model_name(
+    monkeypatch,
+    filename=example_doc_path("layout-parser-paper-fast.pdf"),
+):
+    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
+    with mock.patch.object(
+        layout,
+        "process_file_with_model",
+        mock.MagicMock(),
+    ) as mock_process:
+        pdf.partition_pdf(
+            filename=filename, strategy=PartitionStrategy.HI_RES, hi_res_model_name="checkbox"
+        )
+        # `process_file_with_model` still takes `model_name`; `hi_res_model_name` is mapped onto it
+        assert mock_process.call_args[1]["model_name"] == "checkbox"
+
+
+def test_partition_pdf_or_image_with_hi_res_model_name(
+    monkeypatch,
+    filename=example_doc_path("layout-parser-paper-fast.pdf"),
+):
+    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
+    with mock.patch.object(
+        layout,
+        "process_file_with_model",
+        mock.MagicMock(),
+    ) as mock_process:
+        pdf.partition_pdf_or_image(
+            filename=filename, strategy=PartitionStrategy.HI_RES, hi_res_model_name="checkbox"
+        )
+        # `process_file_with_model` still takes `model_name`; `hi_res_model_name` is mapped onto it
+        assert mock_process.call_args[1]["model_name"] == "checkbox"
+
+
 def test_partition_pdf_with_auto_strategy(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
 ):
@ -798,6 +832,22 @@ def test_partition_pdf_uses_model_name():
        assert mockpartition.call_args.kwargs["model_name"]


+def test_partition_pdf_uses_hi_res_model_name():
+    """`partition_pdf` passes the exact `hi_res_model_name` value to the local partitioner."""
+    with mock.patch.object(
+        pdf,
+        "_partition_pdf_or_image_local",
+    ) as mockpartition:
+        pdf.partition_pdf(
+            example_doc_path("layout-parser-paper-fast.pdf"),
+            hi_res_model_name="test",
+            strategy=PartitionStrategy.HI_RES,
+        )
+
+        mockpartition.assert_called_once()
+        assert mockpartition.call_args.kwargs["hi_res_model_name"] == "test"
+
+
 def test_partition_pdf_word_bbox_not_char(
    filename=example_doc_path("interface-config-guide-p93.pdf"),
 ):
@ -863,6 +913,18 @@ def test_partition_model_name_default_to_None():
        pytest.fail("partition_pdf() raised AttributeError unexpectedly!")


+def test_partition_hi_res_model_name_default_to_None():
+    """An explicit `hi_res_model_name=None` must not raise AttributeError in `partition_pdf`."""
+    pdf_path = example_doc_path("DA-1p.pdf")
+    # Passed via a dict so the explicit `None` is visible as the case under test.
+    partition_kwargs = {"strategy": PartitionStrategy.HI_RES, "hi_res_model_name": None}
+    try:
+        pdf.partition_pdf(filename=pdf_path, **partition_kwargs)
+    except AttributeError:
+        # `None` reached a string method such as `.startswith` without a default being applied.
+        pytest.fail("partition_pdf() raised AttributeError unexpectedly!")
+
+
@pytest.mark.parametrize(
    ("strategy", "ocr_func"),
    [
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -356,6 +356,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
        image_output_dir_path=ANY,
        strategy=PartitionStrategy.FAST,
        languages=None,
+        hi_res_model_name=None,
    )


--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -142,6 +142,8 @@ def partition(
    data_source_metadata: Optional[DataSourceMetadata] = None,
    metadata_filename: Optional[str] = None,
    request_timeout: Optional[int] = None,
+    hi_res_model_name: Optional[str] = None,
+    model_name: Optional[str] = None,  # to be deprecated
    **kwargs,
 ):
    """Partitions a document into its constituent elements. Will use libmagic to determine
@ -202,6 +204,11 @@ def partition(
    request_timeout
        The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
        requests will block indefinitely.
+    hi_res_model_name
+        The layout detection model used when partitioning strategy is set to `hi_res`.
+    model_name
+        The layout detection model used when partitioning strategy is set to `hi_res`. To be
+        deprecated in favor of `hi_res_model_name`.
    """
    exactly_one(file=file, filename=filename, url=url)

@ -391,6 +398,7 @@ def partition(
            languages=languages,
            extract_images_in_pdf=pdf_extract_images,
            image_output_dir_path=pdf_image_output_dir_path,
+            hi_res_model_name=hi_res_model_name or model_name,
            **kwargs,
        )
    elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
@ -402,6 +410,7 @@ def partition(
            infer_table_structure=infer_table_structure,
            strategy=strategy,
            languages=languages,
+            hi_res_model_name=hi_res_model_name or model_name,
            **kwargs,
        )
    elif filetype == FileType.TXT:
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -25,6 +25,7 @@ def partition_image(
    strategy: str = PartitionStrategy.HI_RES,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
+    hi_res_model_name: Optional[str] = None,
    **kwargs,
 ) -> List[Element]:
    """Parses an image into a list of interpreted elements.
@ -55,6 +56,8 @@ def partition_image(
        The default strategy is `hi_res`.
    metadata_last_modified
        The last modified date for the document.
+    hi_res_model_name
+        The layout detection model used when partitioning strategy is set to `hi_res`.
    """
    exactly_one(filename=filename, file=file)

@ -89,5 +92,6 @@ def partition_image(
        languages=languages,
        strategy=strategy,
        metadata_last_modified=metadata_last_modified,
+        hi_res_model_name=hi_res_model_name,
        **kwargs,
    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -143,6 +143,7 @@ def partition_pdf(
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
+    hi_res_model_name: Optional[str] = None,
    **kwargs,
 ) -> List[Element]:
    """Parses a pdf document into a list of interpreted elements.
@ -182,6 +183,8 @@ def partition_pdf(
    image_output_dir_path
        Only applicable if `strategy=hi_res`.
        The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
+    hi_res_model_name
+        The layout detection model used when partitioning strategy is set to `hi_res`.
    """

    exactly_one(filename=filename, file=file)
@ -199,6 +202,7 @@ def partition_pdf(
        extract_images_in_pdf=extract_images_in_pdf,
        extract_element_types=extract_element_types,
        image_output_dir_path=image_output_dir_path,
+        hi_res_model_name=hi_res_model_name,
        **kwargs,
    )

@ -244,13 +248,14 @@ def _partition_pdf_or_image_local(
    include_page_breaks: bool = False,
    languages: Optional[List[str]] = None,
    ocr_mode: str = OCRMode.FULL_PAGE.value,
-    model_name: Optional[str] = None,
+    model_name: Optional[str] = None,  # to be deprecated in favor of `hi_res_model_name`
    metadata_last_modified: Optional[str] = None,
    pdf_text_extractable: bool = False,
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
    pdf_image_dpi: Optional[int] = None,
+    hi_res_model_name: Optional[str] = None,
    analysis: bool = False,
    analyzed_image_output_dir_path: Optional[str] = None,
    **kwargs,
@ -275,10 +280,12 @@ def _partition_pdf_or_image_local(

    ocr_languages = prepare_languages_for_tesseract(languages)

-    model_name = model_name or default_hi_res_model(infer_table_structure)
+    hi_res_model_name = (
+        hi_res_model_name or model_name or default_hi_res_model(infer_table_structure)
+    )
    if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if model_name == "chipper" else 200
-    if (pdf_image_dpi < 300) and (model_name == "chipper"):
+        pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
+    if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
        logger.warning(
            "The Chipper model performs better when images are rendered with DPI >= 300 "
            f"(currently {pdf_image_dpi}).",
@ -288,7 +295,7 @@ def _partition_pdf_or_image_local(
        inferred_document_layout = process_file_with_model(
            filename,
            is_image=is_image,
-            model_name=model_name,
+            model_name=hi_res_model_name,
            pdf_image_dpi=pdf_image_dpi,
        )

@ -314,7 +321,7 @@ def _partition_pdf_or_image_local(
            extracted_layout=extracted_layout,
        )

-        if model_name.startswith("chipper"):
+        if hi_res_model_name.startswith("chipper"):
            # NOTE(alan): We shouldn't do OCR with chipper
            final_document_layout = merged_document_layout
        else:
@ -331,7 +338,7 @@ def _partition_pdf_or_image_local(
        inferred_document_layout = process_data_with_model(
            file,
            is_image=is_image,
-            model_name=model_name,
+            model_name=hi_res_model_name,
            pdf_image_dpi=pdf_image_dpi,
        )
        if hasattr(file, "seek"):
@ -347,7 +354,7 @@ def _partition_pdf_or_image_local(
            extracted_layout=extracted_layout,
        )

-        if model_name.startswith("chipper"):
+        if hi_res_model_name.startswith("chipper"):
            # NOTE(alan): We shouldn't do OCR with chipper
            final_document_layout = merged_document_layout
        else:
@ -364,7 +371,7 @@ def _partition_pdf_or_image_local(
            )

    # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if model_name == "chipper":
+    if hi_res_model_name == "chipper":
        kwargs["sort_mode"] = SORT_MODE_DONT

    final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
@ -434,7 +441,7 @@ def _partition_pdf_or_image_local(
            ).strip()
            # NOTE(alan): with chipper there are parent elements with no text we don't want to
            # filter those out and leave the children orphaned.
-            if el.text or isinstance(el, PageBreak) or model_name.startswith("chipper"):
+            if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
                out_elements.append(cast(Element, el))

    return out_elements
@ -453,6 +460,7 @@ def partition_pdf_or_image(
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
+    hi_res_model_name: Optional[str] = None,
    **kwargs,
 ) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
@ -514,6 +522,7 @@ def partition_pdf_or_image(
                extract_images_in_pdf=extract_images_in_pdf,
                extract_element_types=extract_element_types,
                image_output_dir_path=image_output_dir_path,
+                hi_res_model_name=hi_res_model_name,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)