chore: change table param name (#513)

Updated the parameter names that control whether we try to infer table structure.
2025-10-02 20:07:27 +00:00 · 2023-04-21 13:48:19 -05:00 · 2023-04-21 13:48:19 -05:00 · 5b6640a55a
commit 5b6640a55a
parent ba59ad6b3a
6 changed files with 36 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
 ## 0.6.1
 ### Enhancements
 * Updated the table extraction parameter name to be more descriptive
 ### Features
 ### Fixes
 ## 0.6.0
 ### Enhancements
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
    with patch(
        "unstructured_inference.inference.layout.process_file_with_model",
    ) as mock_process_file_with_model:
-        partition(filename, pdf_extract_tables=True)
+        partition(filename, pdf_infer_table_structure=True)
        assert mock_process_file_with_model.call_args[1]["extract_tables"]
@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
        url=None,
        include_page_breaks=False,
        encoding="utf-8",
-        extract_tables=False,
+        infer_table_structure=False,
        strategy="fast",
        ocr_languages="eng",
    )
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
    with mock.patch(
        "unstructured_inference.inference.layout.process_file_with_model",
    ) as mock_process_file_with_model:
-        pdf.partition_pdf(filename, extract_tables=True)
+        pdf.partition_pdf(filename, infer_table_structure=True)
        assert mock_process_file_with_model.call_args[1]["extract_tables"]
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.0"  # pragma: no cover
+__version__ = "0.6.1"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -35,7 +35,7 @@ def partition(
    headers: Dict[str, str] = {},
    ssl_verify: bool = True,
    ocr_languages: str = "eng",
-    pdf_extract_tables: bool = False,
+    pdf_infer_table_structure: bool = False,
 ):
    """Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
@ -71,9 +71,11 @@ def partition(
    ocr_languages
        The languages to use for the Tesseract agent. To use a language, you'll first need
        to install the appropriate Tesseract language pack.
-    pdf_extract_tables
+    pdf_infer_table_structure
-        If True, in the case that the file to be processed is detected to be a PDF, any tables that
+        If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
-        are detected will be extracted.
+        additional metadata field, "text_as_html," where the value (string) is just a
        transformation of the data into an HTML <table>.
        The "text" field for a partitioned Table Element is always present, whether True or False.
    """
    exactly_one(file=file, filename=filename, url=url)
@ -134,7 +136,7 @@ def partition(
            url=None,
            include_page_breaks=include_page_breaks,
            encoding=encoding,
-            extract_tables=pdf_extract_tables,
+            infer_table_structure=pdf_infer_table_structure,
            strategy=strategy,
            ocr_languages=ocr_languages,
        )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -22,7 +22,7 @@ def partition_pdf(
    token: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = "hi_res",
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
    encoding: str = "utf-8",
    ocr_languages: str = "eng",
 ) -> List[Element]:
@ -45,12 +45,13 @@ def partition_pdf(
        The strategy to use for partitioning the PDF. Uses a layout detection model if set
        to 'hi_res', otherwise partition_pdf simply extracts the text from the document
        and processes it.
-    extract_tables
+    infer_table_structure
-        If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
+        Only applicable if `strategy=hi_res`.
-        is True or False, the partitioning process will attempt to identify any tables in the
+        If True, any Table elements that are extracted will also have a metadata field
-        document. This parameter indicates that the partitioning process will attempt to extract the
+        named "text_as_html" where the table's text content is rendered into an html string.
-        structure of any identified tables. The table structure and cell contents will be stored as
+        I.e., rows and cells are preserved.
-        HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
+        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    ocr_languages
@ -66,7 +67,7 @@ def partition_pdf(
        token=token,
        include_page_breaks=include_page_breaks,
        strategy=strategy,
-        extract_tables=extract_tables,
+        infer_table_structure=infer_table_structure,
        encoding=encoding,
        ocr_languages=ocr_languages,
    )
@ -81,7 +82,7 @@ def partition_pdf_or_image(
    is_image: bool = False,
    include_page_breaks: bool = False,
    strategy: str = "hi_res",
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
    encoding: str = "utf-8",
    ocr_languages: str = "eng",
 ) -> List[Element]:
@ -117,7 +118,7 @@ def partition_pdf_or_image(
                    file=file,
                    template=out_template,
                    is_image=is_image,
-                    extract_tables=extract_tables,
+                    infer_table_structure=infer_table_structure,
                    include_page_breaks=True,
                    ocr_languages=ocr_languages,
                )
@ -128,7 +129,7 @@ def partition_pdf_or_image(
                    "detectron2 is not installed. Cannot use the hi_res partitioning "
                    "strategy. Falling back to partitioning with the fast strategy.",
                )
-            if extract_tables:
+            if infer_table_structure:
                logger.warning(
                    "Table extraction was selected, but is being ignored while using the fast "
                    "strategy.",
@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
    file: Optional[bytes] = None,
    template: Optional[str] = None,
    is_image: bool = False,
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
    include_page_breaks: bool = False,
    ocr_languages: str = "eng",
 ) -> List[Element]:
@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
            template,
            is_image=is_image,
            ocr_languages=ocr_languages,
-            extract_tables=extract_tables,
+            extract_tables=infer_table_structure,
        )
    else:
        layout = process_data_with_model(
@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
            template,
            is_image=is_image,
            ocr_languages=ocr_languages,
-            extract_tables=extract_tables,
+            extract_tables=infer_table_structure,
        )
    return document_to_element_list(layout, include_page_breaks=include_page_breaks)
@ -1 +1 @@
-__version__ = "0.6.0"  # pragma: no cover
+__version__ = "0.6.1"  # pragma: no cover