chore: change table param name (#513)

Updated the parameter names that control whether we try to infer table structure.
2025-11-30 09:09:53 +00:00 · 2023-04-21 13:48:19 -05:00 · 2023-04-21 13:48:19 -05:00 · 5b6640a55a
commit 5b6640a55a
parent ba59ad6b3a
6 changed files with 36 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
+## 0.6.1
+
+### Enhancements
+
+* Updated the table extraction parameter name to be more descriptive
+
+### Features
+
+### Fixes
+
 ## 0.6.0

 ### Enhancements
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
    with patch(
        "unstructured_inference.inference.layout.process_file_with_model",
    ) as mock_process_file_with_model:
-        partition(filename, pdf_extract_tables=True)
+        partition(filename, pdf_infer_table_structure=True)
        assert mock_process_file_with_model.call_args[1]["extract_tables"]


@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
        url=None,
        include_page_breaks=False,
        encoding="utf-8",
-        extract_tables=False,
+        infer_table_structure=False,
        strategy="fast",
        ocr_languages="eng",
    )
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
    with mock.patch(
        "unstructured_inference.inference.layout.process_file_with_model",
    ) as mock_process_file_with_model:
-        pdf.partition_pdf(filename, extract_tables=True)
+        pdf.partition_pdf(filename, infer_table_structure=True)
        assert mock_process_file_with_model.call_args[1]["extract_tables"]
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.0"  # pragma: no cover
+__version__ = "0.6.1"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -35,7 +35,7 @@ def partition(
    headers: Dict[str, str] = {},
    ssl_verify: bool = True,
    ocr_languages: str = "eng",
-    pdf_extract_tables: bool = False,
+    pdf_infer_table_structure: bool = False,
 ):
    """Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
@ -71,9 +71,11 @@ def partition(
    ocr_languages
        The languages to use for the Tesseract agent. To use a language, you'll first need
        to install the appropriate Tesseract language pack.
-    pdf_extract_tables
-        If True, in the case that the file to be processed is detected to be a PDF, any tables that
-        are detected will be extracted.
+    pdf_infer_table_structure
+        If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
+        additional metadata field, "text_as_html," where the value (string) is just a
+        transformation of the data into an HTML <table>.
+        The "text" field for a partitioned Table Element is always present, whether True or False.
    """
    exactly_one(file=file, filename=filename, url=url)

@ -134,7 +136,7 @@ def partition(
            url=None,
            include_page_breaks=include_page_breaks,
            encoding=encoding,
-            extract_tables=pdf_extract_tables,
+            infer_table_structure=pdf_infer_table_structure,
            strategy=strategy,
            ocr_languages=ocr_languages,
        )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -22,7 +22,7 @@ def partition_pdf(
    token: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = "hi_res",
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
    encoding: str = "utf-8",
    ocr_languages: str = "eng",
 ) -> List[Element]:
@ -45,12 +45,13 @@ def partition_pdf(
        The strategy to use for partitioning the PDF. Uses a layout detection model if set
        to 'hi_res', otherwise partition_pdf simply extracts the text from the document
        and processes it.
-    extract_tables
-        If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
-        is True or False, the partitioning process will attempt to identify any tables in the
-        document. This parameter indicates that the partitioning process will attempt to extract the
-        structure of any identified tables. The table structure and cell contents will be stored as
-        HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
+    infer_table_structure
+        Only applicable if `strategy=hi_res`.
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    ocr_languages
@ -66,7 +67,7 @@ def partition_pdf(
        token=token,
        include_page_breaks=include_page_breaks,
        strategy=strategy,
-        extract_tables=extract_tables,
+        infer_table_structure=infer_table_structure,
        encoding=encoding,
        ocr_languages=ocr_languages,
    )
@ -81,7 +82,7 @@ def partition_pdf_or_image(
    is_image: bool = False,
    include_page_breaks: bool = False,
    strategy: str = "hi_res",
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
    encoding: str = "utf-8",
    ocr_languages: str = "eng",
 ) -> List[Element]:
@ -117,7 +118,7 @@ def partition_pdf_or_image(
                    file=file,
                    template=out_template,
                    is_image=is_image,
-                    extract_tables=extract_tables,
+                    infer_table_structure=infer_table_structure,
                    include_page_breaks=True,
                    ocr_languages=ocr_languages,
                )
@ -128,7 +129,7 @@ def partition_pdf_or_image(
                    "detectron2 is not installed. Cannot use the hi_res partitioning "
                    "strategy. Falling back to partitioning with the fast strategy.",
                )
-            if extract_tables:
+            if infer_table_structure:
                logger.warning(
                    "Table extraction was selected, but is being ignored while using the fast "
                    "strategy.",
@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
    file: Optional[bytes] = None,
    template: Optional[str] = None,
    is_image: bool = False,
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
    include_page_breaks: bool = False,
    ocr_languages: str = "eng",
 ) -> List[Element]:
@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
            template,
            is_image=is_image,
            ocr_languages=ocr_languages,
-            extract_tables=extract_tables,
+            extract_tables=infer_table_structure,
        )
    else:
        layout = process_data_with_model(
@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
            template,
            is_image=is_image,
            ocr_languages=ocr_languages,
-            extract_tables=extract_tables,
+            extract_tables=infer_table_structure,
        )

    return document_to_element_list(layout, include_page_breaks=include_page_breaks)