Chore: Pass table support param to partition image (#973)

* add param and test in image table extraction * version and changelog * need to publish this one for api repo * add new param skip_infer_table_types * use warning * clean up with mapping * add test for tsv * fix test fail * weird change from merge * doc nit * don't use mapping * correct conflict
2025-12-04 11:10:22 +00:00 · 2023-07-27 13:33:36 -04:00 · 2023-07-27 13:33:36 -04:00 · d46c1c2d83
commit d46c1c2d83
parent e7f2f1e3eb
8 changed files with 68 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.8.5-dev2
+## 0.8.5

 ### Enhancements

+* Add parameter `skip_infer_table_types` to `partition` to choose which document types skip table extraction
 * Adds optional Unstructured API unit tests in CI
 * Tracks last modified date for all document types.

--- a/example-docs/layout-parser-paper-with-table.jpg
+++ b/example-docs/layout-parser-paper-with-table.jpg
--- a/test_unstructured/file_utils/test_exploration.py
+++ b/test_unstructured/file_utils/test_exploration.py
@@ -69,7 +69,7 @@ def test_get_file_info(tmpdir):
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

-    means = file_info.groupby("filetype").mean()
+    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]


--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -771,6 +771,15 @@ def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.cs
    assert elements[0].metadata.filetype == "text/csv"


+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
+    """The first element of an auto-partitioned TSV has the expected text, HTML and filetype."""
+    head = partition(filename=filename)[0]
+    assert clean_extra_whitespace(head.text) == EXPECTED_TEXT
+    assert head.metadata.text_as_html == EXPECTED_TABLE
+    assert head.metadata.filetype == "text/tsv"
+
+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    with open(filename, "rb") as f:
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@@ -118,6 +118,19 @@ def test_partition_image_with_auto_strategy(
    assert titles[0].text == title


+def test_partition_image_with_table_extraction(
+    filename="example-docs/layout-parser-paper-with-table.jpg",
+):
+    """hi_res image partitioning with table inference yields exactly one HTML table."""
+    options = {"strategy": "hi_res", "infer_table_structure": True}
+    elements = image.partition_image(filename=filename, **options)
+
+    # Keep only the elements whose metadata carries a non-empty HTML rendering.
+    html_tables = list(filter(None, (el.metadata.text_as_html for el in elements)))
+    assert len(html_tables) == 1
+    assert "Layouts of history Japanese documents" in html_tables[0]
+
+
 def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
    with mock.patch.object(
        layout,
--- a/unstructured/version.py
+++ b/unstructured/version.py
@@ -1 +1 @@
-__version__ = "0.8.5-dev2"  # pragma: no cover
+__version__ = "0.8.5"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -1,5 +1,5 @@
 import io
-from typing import IO, Callable, Dict, Optional, Tuple
+from typing import IO, Callable, Dict, List, Optional, Tuple

 import requests

@@ -47,6 +47,7 @@ def partition(
    encoding: Optional[str] = None,
    paragraph_grouper: Optional[Callable[[str], str]] = None,
    headers: Dict[str, str] = {},
+    skip_infer_table_types: List[str] = ["pdf", "jpg", "png"],
    ssl_verify: bool = True,
    ocr_languages: str = "eng",
    pdf_infer_table_structure: bool = False,
@@ -82,6 +83,8 @@ def partition(
        The encoding method used to decode the text input. If None, utf-8 will be used.
    headers
        The headers to be used in conjunction with the HTTP request if URL is set.
+    skip_infer_table_types
+        The document types for which table extraction should be skipped.
    ssl_verify
        If the URL parameter is set, determines whether or not partition uses SSL verification
        in the HTTP request.
@@ -123,6 +126,12 @@ def partition(
    if file is not None:
        file.seek(0)

+    infer_table_structure = decide_table_extraction(
+        filetype,
+        skip_infer_table_types,
+        pdf_infer_table_structure,
+    )
+
    if filetype == FileType.DOC:
        elements = partition_doc(filename=filename, file=file, **kwargs)
    elif filetype == FileType.DOCX:
@@ -183,7 +192,7 @@ def partition(
            file=file,  # type: ignore
            url=None,
            include_page_breaks=include_page_breaks,
-            infer_table_structure=pdf_infer_table_structure,
+            infer_table_structure=infer_table_structure,
            strategy=strategy,
            ocr_languages=ocr_languages,
            **kwargs,
@@ -194,6 +203,7 @@ def partition(
            file=file,  # type: ignore
            url=None,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            strategy=strategy,
            ocr_languages=ocr_languages,
            **kwargs,
@@ -274,3 +284,22 @@ def file_and_type_from_url(

    filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)
    return file, filetype
+
+
+def decide_table_extraction(
+    filetype: Optional[FileType],
+    skip_infer_table_types: List[str],
+    pdf_infer_table_structure: bool,
+) -> bool:
+    """Return True unless the lowercased file type name is listed in `skip_infer_table_types`."""
+    doc_type = filetype.name.lower() if filetype else None
+    skipped = doc_type in skip_infer_table_types
+    # For PDFs, pdf_infer_table_structure=True overrides the skip list; warn about the conflict.
+    if doc_type == "pdf" and skipped and pdf_infer_table_structure:
+        logger.warning(
+            f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
+            f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
+            "please reset skip_infer_table_types to turn on table extraction for PDFs.",
+        )
+        return True
+    return not skipped
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@@ -10,6 +10,7 @@ def partition_image(
    filename: str = "",
    file: Optional[bytes] = None,
    include_page_breaks: bool = False,
+    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
    strategy: str = "auto",
    metadata_date: Optional[str] = None,
@@ -23,6 +24,15 @@ def partition_image(
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
+    include_page_breaks
+        If True, includes page breaks at the end of each page in the document.
+    infer_table_structure
+        Only applicable if `strategy=hi_res`.
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    ocr_languages
        The languages to use for the Tesseract agent. To use a language, you'll first need
        to install the appropriate Tesseract language pack.
@@ -44,6 +54,7 @@ def partition_image(
        file=file,
        is_image=True,
        include_page_breaks=include_page_breaks,
+        infer_table_structure=infer_table_structure,
        ocr_languages=ocr_languages,
        strategy=strategy,
        metadata_date=metadata_date,