Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
2025-12-28 15:45:21 +00:00 · 2023-08-02 09:22:20 -07:00 · 2023-08-02 09:22:20 -07:00 · a23d75a292
commit a23d75a292
parent 499f37f64b
6 changed files with 45 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,7 @@

 ### Enhancements

+* Set the default strategy for partitioning images to `hi_res`
 * Add page break parameter section in API documentation to sync with change in Prod API

 ### Features
@ -22,7 +23,6 @@

 ### Fixes

-
 * Rename "date" field to "last_modified"
 * Adds Box connector

--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -376,6 +376,26 @@ def test_partition_pdf_doesnt_raise_warning():
        partition(filename=filename, strategy="hi_res")


+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
+)
+def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
+    file_filename = filename if pass_file_filename else None
+    elements = partition(
+        filename=filename,
+        file_filename=file_filename,
+        content_type=content_type,
+        strategy="auto",
+    )
+
+    # should be the same result as test_partition_image_default_strategy_hi_res() in test_image.py
+    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    assert elements[0].text == first_line
+    assert elements[0].metadata.coordinates is not None
+
+
@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@ -213,6 +213,16 @@ def test_partition_image_raises_with_bad_strategy():
        image.partition_image(filename=filename, strategy="fakeroo")


+def test_partition_image_default_strategy_hi_res():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
+    with open(filename, "rb") as f:
+        elements = image.partition_image(file=f)
+
+    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    assert elements[0].text == first_line
+    assert elements[0].metadata.coordinates is not None
+
+
 def test_partition_image_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
--- a/test_unstructured/partition/test_strategies.py
+++ b/test_unstructured/partition/test_strategies.py
@ -43,18 +43,9 @@ def test_is_pdf_text_extractable(filename, from_file, expected):
    assert bool(extractable) is expected


-@pytest.mark.parametrize(
-    ("infer_table_structure", "expected"),
-    [
-        (True, "hi_res"),
-        (False, "ocr_only"),
-    ],
-)
-def test_determine_image_auto_strategy(infer_table_structure, expected):
-    strategy = strategies._determine_image_auto_strategy(
-        infer_table_structure=infer_table_structure,
-    )
-    assert strategy is expected
+def test_determine_image_auto_strategy():
+    strategy = strategies._determine_image_auto_strategy()
+    assert strategy == "hi_res"


@pytest.mark.parametrize(
@ -74,9 +65,9 @@ def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structur
    assert strategy is expected


-def test_determine_pdf_or_image_strategy_fallback_ocr():
+def test_determine_pdf_or_image_strategy_fallback_hi_res():
    strategy = strategies.determine_pdf_or_image_strategy(
        strategy="fast",
        is_image=True,
    )
-    assert strategy == "ocr_only"
+    assert strategy == "hi_res"
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -12,7 +12,7 @@ def partition_image(
    include_page_breaks: bool = False,
    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
-    strategy: str = "auto",
+    strategy: str = "hi_res",
    metadata_last_modified: Optional[str] = None,
    **kwargs,
 ) -> List[Element]:
@ -41,11 +41,11 @@ def partition_image(
        "ocr_only". When using the "hi_res" strategy, the function uses a layout detection
        model to identify document elements. When using the "ocr_only" strategy,
        partition_image simply extracts the text from the document using OCR and processes it.
-        The default strategy `auto` will determine when a image can be extracted using
-        `ocr_only` mode, otherwise it will fall back to `hi_res`.
+        The default strategy is `hi_res`.
    metadata_last_modified
        The last modified date for the document.

+
    """
    exactly_one(filename=filename, file=file)

--- a/unstructured/partition/strategies.py
+++ b/unstructured/partition/strategies.py
@ -47,10 +47,9 @@ def determine_pdf_or_image_strategy(

    if is_image:
        # Note(yuming): There is no fast strategy for images,
-        # use ocr_only as a fallback plan for consistency with PDFs.
-        # This can be removed once unstructured-api use auto as the default strategy.
+        # use hi_res as a fallback plan since it is the auto default.
        if strategy == "fast":
-            strategy = "ocr_only"
+            strategy = "hi_res"
        validate_strategy(strategy, "image")
        pdf_text_extractable = False
    else:
@ -58,7 +57,7 @@ def determine_pdf_or_image_strategy(

    if strategy == "auto":
        if is_image:
-            strategy = _determine_image_auto_strategy(infer_table_structure=infer_table_structure)
+            strategy = _determine_image_auto_strategy()
        else:
            strategy = _determine_pdf_auto_strategy(
                pdf_text_extractable=pdf_text_extractable,
@ -115,13 +114,11 @@ def determine_pdf_or_image_strategy(
    return strategy


-def _determine_image_auto_strategy(infer_table_structure: bool = False):
+def _determine_image_auto_strategy():
    """If "auto" is passed in as the strategy, determines what strategy to use
    for images."""
-    if infer_table_structure is True:
-        return "hi_res"
-    else:
-        return "ocr_only"
+    # Always use hi_res for "auto", since an image is only about one page long
+    return "hi_res"


 def _determine_pdf_auto_strategy(