chore: fix infer_table bug (#1833)

Carry `skip_infer_table_types` through to `infer_table_structure` in the partition flow. Table elements from PPT/X, DOC/X, etc. now omit the `text_as_html` field when their filetype is listed in `skip_infer_table_types`. Note: I've continued to exclude this variable from partitioners that go through the HTML flow; if we already have the HTML, it doesn't make sense to carry the infer variable along, since we're not inferring the HTML table in those cases. TODO: ✅ add unit tests --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
2025-08-15 20:27:37 +00:00 · 2023-10-23 17:11:53 -07:00 · 2023-10-23 17:11:53 -07:00 · 0584e1d031
commit 0584e1d031
parent 6707cab250
20 changed files with 246 additions and 38 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.10.26-dev1
+## 0.10.26-dev2

 ### Enhancements

@ -10,6 +10,8 @@

 ### Fixes

+* **Fix a bug in Table partitioning** Previously the `skip_infer_table_types` variable used in `partition` was not passed down to the file-specific partitioners. Now you can use the `skip_infer_table_types` list variable in `partition` to name the filetypes whose Table elements should omit the `text_as_html` metadata field, or set the `infer_table_structure` boolean variable on the file-specific partitioning function.
+
 ## 0.10.25

 ### Enhancements
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
 Table Extraction for other filetypes
 ------------------------------------

-We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs Images, and Excel files which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
+We also provide support for enabling and disabling table extraction for file types other than PDF files. Set the ``skip_infer_table_types`` parameter to specify the document types for which you want to skip table extraction. By default, we skip table extraction for PDFs, images, and Excel files, i.e. ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction for images and PDFs only works with the ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:

 .. tabs::

--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -28,7 +28,7 @@ def test_it_splits_a_large_section_into_multiple_chunks():
        Title("Introduction"),
        Text(
            "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
-            " porta volutpat."
+            " porta volutpat.",
        ),
    ]

--- a/test_unstructured/partition/csv/test_csv.py
+++ b/test_unstructured/partition/csv/test_csv.py
@ -35,6 +35,24 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
    assert elements[0].metadata.filename == filename


+@pytest.mark.parametrize(
+    "infer_table_structure",
+    [
+        True,
+        False,
+    ],
+)
+def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
+    f_path = "example-docs/stanley-cups.csv"
+    elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure)
+
+    table_element_has_text_as_html_field = (
+        hasattr(elements[0].metadata, "text_as_html")
+        and elements[0].metadata.text_as_html is not None
+    )
+    assert table_element_has_text_as_html_field == infer_table_structure
+
+
 def test_partition_csv_from_filename_with_metadata_filename(
    filename="example-docs/stanley-cups.csv",
 ):
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@ -74,6 +74,25 @@ def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
        assert element.metadata.filename is None


+@pytest.mark.parametrize(
+    "infer_table_structure",
+    [
+        True,
+        False,
+    ],
+)
+def test_partition_docx_infer_table_structure(infer_table_structure):
+    elements = partition_docx(
+        filename="example-docs/fake_table.docx",
+        infer_table_structure=infer_table_structure,
+    )
+    table_element_has_text_as_html_field = (
+        hasattr(elements[0].metadata, "text_as_html")
+        and elements[0].metadata.text_as_html is not None
+    )
+    assert table_element_has_text_as_html_field == infer_table_structure
+
+
 def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)
@ -265,6 +284,7 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dic
        None,
        None,
        False,
+        True,
        None,
    )
    paragraph = partitioner._document.paragraphs[1]
@ -289,6 +309,7 @@ def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]):
        None,
        None,
        False,
+        True,
        None,
    )
    table = partitioner._document.tables[0]
@ -305,6 +326,7 @@ def test_table_emphasis(
        None,
        None,
        False,
+        True,
        None,
    )
    table = partitioner._document.tables[0]
@ -350,7 +372,14 @@ def test_partition_docx_with_json(mock_document, tmpdir):


 def test_parse_category_depth_by_style():
-    partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
+    partitioner = _DocxPartitioner(
+        "example-docs/category-level.docx",
+        None,
+        None,
+        False,
+        True,
+        None,
+    )

    # Category depths are 0-indexed and relative to the category type
    # Title, list item, bullet, narrative text, etc.
@ -381,7 +410,7 @@ def test_parse_category_depth_by_style():


 def test_parse_category_depth_by_style_name():
-    partitioner = _DocxPartitioner(None, None, None, False, None)
+    partitioner = _DocxPartitioner(None, None, None, False, True, None)

    test_cases = [
        (0, "Heading 1"),
@ -406,7 +435,7 @@ def test_parse_category_depth_by_style_name():


 def test_parse_category_depth_by_style_ilvl():
-    partitioner = _DocxPartitioner(None, None, None, False, None)
+    partitioner = _DocxPartitioner(None, None, None, False, True, None)
    assert partitioner._parse_category_depth_by_style_ilvl() == 0


--- a/test_unstructured/partition/odt/test_odt.py
+++ b/test_unstructured/partition/odt/test_odt.py
@ -1,6 +1,8 @@
 import os
 import pathlib

+import pytest
+
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import Table, TableChunk, Title
@ -54,6 +56,24 @@ def test_partition_odt_from_file():
    ]


+@pytest.mark.parametrize(
+    "infer_table_structure",
+    [
+        True,
+        False,
+    ],
+)
+def test_partition_odt_infer_table_structure(infer_table_structure):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    with open(filename, "rb") as f:
+        elements = partition_odt(file=f, infer_table_structure=infer_table_structure)
+    table_element_has_text_as_html_field = (
+        hasattr(elements[1].metadata, "text_as_html")
+        and elements[1].metadata.text_as_html is not None
+    )
+    assert table_element_has_text_as_html_field == infer_table_structure
+
+
 def test_partition_odt_from_file_with_metadata_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(filename, "rb") as f:
--- a/test_unstructured/partition/pptx/test_pptx.py
+++ b/test_unstructured/partition/pptx/test_pptx.py
@ -262,6 +262,26 @@ def test_partition_pptx_grabs_tables():
    assert elements[1].metadata.filename == "fake-power-point-table.pptx"


+@pytest.mark.parametrize(
+    "infer_table_structure",
+    [
+        True,
+        False,
+    ],
+)
+def test_partition_pptx_infer_table_structure(infer_table_structure):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
+    elements = cast(
+        Sequence[Text],
+        partition_pptx(filename=filename, infer_table_structure=infer_table_structure),
+    )
+    table_element_has_text_as_html_field = (
+        hasattr(elements[1].metadata, "text_as_html")
+        and elements[1].metadata.text_as_html is not None
+    )
+    assert table_element_has_text_as_html_field == infer_table_structure
+
+
 def test_partition_pptx_malformed():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
    elements = cast(Sequence[Text], partition_pptx(filename=filename))
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -713,7 +713,7 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh


 def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
-    elements = partition(filename=filename, include_header=False)
+    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert sum(isinstance(element, Title) for element in elements) == 2
@ -726,9 +726,36 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE


+@pytest.mark.parametrize(
+    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
+    [
+        (["xlsx"], "stanley-cups.xlsx", False),
+        ([], "stanley-cups.xlsx", True),
+        (["odt"], "fake.odt", False),
+        ([], "fake.odt", True),
+    ],
+)
+def test_auto_partition_respects_skip_infer_table_types(
+    skip_infer_table_types, filename, has_text_as_html_field
+):
+    """Each Table carries `text_as_html` unless its filetype is in `skip_infer_table_types`."""
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
+    with open(filename, "rb") as f:
+        table_elements = [
+            e
+            for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
+            if isinstance(e, Table)
+        ]
+        assert table_elements, "example document must yield at least one Table element"
+        for table_element in table_elements:
+            # -- assert per element; asserting after the loop only checks the last table
+            has_html = getattr(table_element.metadata, "text_as_html", None) is not None
+            assert has_html == has_text_as_html_field
+
+
 def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    with open(filename, "rb") as f:
-        elements = partition(file=f, include_header=False)
+        elements = partition(file=f, include_header=False, skip_infer_table_types=[])

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert sum(isinstance(element, Title) for element in elements) == 2
@ -834,7 +861,7 @@ EXPECTED_XLS_TABLE = (

@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
-    elements = partition(filename=filename, include_header=False)
+    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

    assert sum(isinstance(element, Table) for element in elements) == 2
    assert len(elements) == 18
--- a/test_unstructured/partition/xlsx/test_xlsx.py
+++ b/test_unstructured/partition/xlsx/test_xlsx.py
@ -51,6 +51,27 @@ def test_partition_xlsx_from_filename_with_metadata_filename(
    assert elements[0].metadata.filename == "test"


+@pytest.mark.parametrize(
+    "infer_table_structure",
+    [
+        True,
+        False,
+    ],
+)
+def test_partition_xlsx_infer_table_structure(
+    infer_table_structure,
+    filename="example-docs/stanley-cups.xlsx",
+):
+    elements = partition_xlsx(filename=filename, infer_table_structure=infer_table_structure)
+    table_elements = [e for e in elements if isinstance(e, Table)]
+    for table_element in table_elements:
+        table_element_has_text_as_html_field = (
+            hasattr(table_element.metadata, "text_as_html")
+            and table_element.metadata.text_as_html is not None
+        )
+        assert table_element_has_text_as_html_field == infer_table_structure
+
+
 def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
    elements = partition_xlsx(filename=filename, include_header=True)
    assert sum(isinstance(element, Table) for element in elements) == 2
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared
@ -107,8 +107,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups"
    },
    "text": "Stanley Cups"
  },
@ -220,8 +219,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
  },
@ -333,8 +331,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups Since 67"
    },
    "text": "Stanley Cups Since 67"
  },
@ -446,8 +443,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups Since 67"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
  }
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
@ -18,8 +18,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups"
    },
    "text": "Stanley Cups"
  },
@ -42,8 +41,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
  },
@ -66,8 +64,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups Since 67"
    },
    "text": "Stanley Cups Since 67"
  },
@ -90,8 +87,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Stanley Cups Since 67"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
  }
--- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
+++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
@ -18,8 +18,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Example Test",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>MC</td>\n      <td>What is 2+2?</td>\n      <td>4</td>\n      <td>correct</td>\n      <td>3</td>\n      <td>incorrect</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>What C datatypes are 8 bits? (assume i386)</td>\n      <td>int</td>\n      <td></td>\n      <td>float</td>\n      <td></td>\n      <td>double</td>\n      <td></td>\n      <td>char</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>Bagpipes are awesome.</td>\n      <td>true</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Rank the following in their order of operation.</td>\n      <td>Parentheses</td>\n      <td>Exponents</td>\n      <td>Division</td>\n      <td>Addition</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>The student activities fee is</td>\n      <td>95</td>\n      <td>dollars for students enrolled in</td>\n      <td>19</td>\n      <td>units or more,</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Match the lower-case greek letter with its capital form.</td>\n      <td>λ</td>\n      <td>Λ</td>\n      <td>α</td>\n      <td>γ</td>\n      <td>Γ</td>\n      <td>φ</td>\n      <td>Φ</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Example Test"
    },
    "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
  },
@ -88,8 +87,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Format Abbr.",
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Abbreviation</td>\n      <td>Question Type</td>\n    </tr>\n    <tr>\n      <td>MC</td>\n      <td>Multiple Choice</td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>Multiple Answer</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>True/False</td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>Essay</td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Ordering</td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Matching</td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>FIL</td>\n      <td>File response</td>\n    </tr>\n    <tr>\n      <td>NUM</td>\n      <td>Numeric Response</td>\n    </tr>\n    <tr>\n      <td>SR</td>\n      <td>Short response</td>\n    </tr>\n    <tr>\n      <td>OP</td>\n      <td>Opinion</td>\n    </tr>\n    <tr>\n      <td>FIB_PLUS</td>\n      <td>Multiple Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>JUMBLED_SENTENCE</td>\n      <td>Jumbled Sentence</td>\n    </tr>\n    <tr>\n      <td>QUIZ_BOWL</td>\n      <td>Quiz Bowl</td>\n    </tr>\n  </tbody>\n</table>"
+      "page_name": "Format Abbr."
    },
    "text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
  },
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.26-dev1"  # pragma: no cover
+__version__ = "0.10.26-dev2"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -265,6 +265,7 @@ def partition(
        elements = _partition_doc(
            filename=filename,
            file=file,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -274,6 +275,7 @@ def partition(
        elements = _partition_docx(
            filename=filename,
            file=file,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -283,6 +285,7 @@ def partition(
        elements = _partition_odt(
            filename=filename,
            file=file,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -331,6 +334,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -351,6 +355,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -361,6 +366,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -406,6 +412,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -416,6 +423,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -426,6 +434,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -442,6 +451,7 @@ def partition(
        elements = _partition_xlsx(
            filename=filename,
            file=file,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -451,6 +461,7 @@ def partition(
        elements = _partition_csv(
            filename=filename,
            file=file,
+            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@ -32,6 +32,7 @@ def partition_csv(
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    include_metadata: bool = True,
+    infer_table_structure: bool = True,
    languages: Optional[List[str]] = ["auto"],
    # NOTE (jennings) partition_csv generates a single TableElement
    # so detect_language_per_element is not included as a param
@ -51,6 +52,12 @@ def partition_csv(
        The last modified date for the document.
    include_metadata
        Determines whether or not metadata is included in the output.
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -74,11 +81,12 @@ def partition_csv(

    if include_metadata:
        metadata = ElementMetadata(
-            text_as_html=html_text,
            filename=metadata_filename or filename,
            last_modified=metadata_last_modified or last_modification_date,
            languages=languages,
        )
+        if infer_table_structure:
+            metadata.text_as_html = html_text
    else:
        metadata = ElementMetadata()

--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -88,6 +88,7 @@ def convert_and_partition_docx(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_metadata: bool = True,
+    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
@ -108,6 +109,12 @@ def convert_and_partition_docx(
    include_metadata
        Determines whether or not metadata is included in the metadata attribute on the elements in
        the output.
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -153,6 +160,7 @@ def convert_and_partition_docx(
            filename=docx_path,
            metadata_filename=metadata_filename,
            include_metadata=include_metadata,
+            infer_table_structure=infer_table_structure,
            metadata_last_modified=metadata_last_modified,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
@ -170,6 +178,7 @@ def partition_docx(
    metadata_filename: Optional[str] = None,
    include_page_breaks: bool = True,
    include_metadata: bool = True,  # used by decorator
+    infer_table_structure: bool = True,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,  # used by decorator
    languages: Optional[List[str]] = ["auto"],
@ -184,6 +193,12 @@ def partition_docx(
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the document
        to .docx before partition. We want the original source filename in the metadata.
@ -205,6 +220,7 @@ def partition_docx(
        file,
        metadata_filename,
        include_page_breaks,
+        infer_table_structure,
        metadata_last_modified,
    )
    elements = apply_lang_metadata(
@ -246,12 +262,14 @@ class _DocxPartitioner:
        file: Optional[IO[bytes]],
        metadata_filename: Optional[str],
        include_page_breaks: bool,
+        infer_table_structure: bool,
        metadata_last_modified: Optional[str],
    ) -> None:
        self._filename = filename
        self._file = file
        self._metadata_filename = metadata_filename
        self._include_page_breaks = include_page_breaks
+        self._infer_table_structure = infer_table_structure
        self._metadata_last_modified = metadata_last_modified
        self._page_counter: int = 1

@ -262,6 +280,7 @@ class _DocxPartitioner:
        file: Optional[IO[bytes]] = None,
        metadata_filename: Optional[str] = None,
        include_page_breaks: bool = True,
+        infer_table_structure: bool = True,
        metadata_last_modified: Optional[str] = None,
    ) -> Iterator[Element]:
        """Partition MS Word documents (.docx format) into its document elements."""
@ -270,6 +289,7 @@ class _DocxPartitioner:
            file,
            metadata_filename,
            include_page_breaks,
+            infer_table_structure,
            metadata_last_modified,
        )._iter_document_elements()

@ -536,8 +556,9 @@ class _DocxPartitioner:
        """Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
        # -- at present, we always generate exactly one Table element, but we might want
        # -- to skip, for example, an empty table, or accommodate nested tables.
-
-        html_table = convert_ms_office_table_to_text(table, as_html=True)
+        html_table = None
+        if self._infer_table_structure:
+            html_table = convert_ms_office_table_to_text(table, as_html=True)
        text_table = convert_ms_office_table_to_text(table, as_html=False)
        emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)

--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@ -17,6 +17,7 @@ def partition_odt(
    filename: Optional[str] = None,
    file: Optional[BinaryIO] = None,
    include_metadata: bool = True,
+    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
@ -32,6 +33,12 @@ def partition_odt(
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    metadata_last_modified
        The last modified date for the document.
    languages
@ -53,6 +60,7 @@ def partition_odt(
        source_format="odt",
        filename=filename,
        file=file,
+        infer_table_structure=infer_table_structure,
        metadata_filename=metadata_filename,
        metadata_last_modified=metadata_last_modified or last_modification_date,
        languages=languages,
--- a/unstructured/partition/ppt.py
+++ b/unstructured/partition/ppt.py
@ -22,6 +22,7 @@ def partition_ppt(
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
+    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
@ -39,6 +40,12 @@ def partition_ppt(
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, includes a PageBreak element between slides
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    metadata_last_modified
        The last modified date for the document.
    languages
@ -82,6 +89,7 @@ def partition_ppt(
        pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
        elements = partition_pptx(
            filename=pptx_filename,
+            infer_table_structure=infer_table_structure,
            metadata_filename=metadata_filename,
            metadata_last_modified=metadata_last_modified or last_modification_date,
            languages=languages,
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@ -56,6 +56,7 @@ def partition_pptx(
    include_metadata: bool = True,
    metadata_last_modified: Optional[str] = None,
    include_slide_notes: bool = False,
+    infer_table_structure: bool = True,
    chunking_strategy: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
@ -79,6 +80,12 @@ def partition_pptx(
        The last modified date for the document.
    include_slide_notes
        If True, includes the slide notes as element
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -104,6 +111,7 @@ def partition_pptx(
        source_file,
        include_page_breaks,
        include_slide_notes,
+        infer_table_structure,
        metadata_filename,
        metadata_last_modified,
    )
@ -126,12 +134,14 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
        # -- this object in tests and makes them less sensitive to signature changes.
        include_page_breaks: bool = True,
        include_slide_notes: bool = False,
+        infer_table_structure: bool = True,
        metadata_filename: Optional[str] = None,
        metadata_last_modified: Optional[str] = None,
    ) -> None:
        self._file = file
        self._include_page_breaks = include_page_breaks
        self._include_slide_notes = include_slide_notes
+        self._infer_table_structure = infer_table_structure
        self._metadata_filename = metadata_filename
        self._metadata_last_modified = metadata_last_modified
        self._page_counter = 0
@ -142,6 +152,7 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
        file: Union[str, IO[bytes]],
        include_page_breaks: bool,
        include_slide_notes: bool,
+        infer_table_structure: bool,
        metadata_filename: Optional[str],
        metadata_last_modified: Optional[str],
    ) -> Iterator[Element]:
@ -150,6 +161,7 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
            file,
            include_page_breaks,
            include_slide_notes,
+            infer_table_structure,
            metadata_filename,
            metadata_last_modified,
        )._iter_presentation_elements()
@ -319,7 +331,9 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
        text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
        if not text_table:
            return
-        html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
+        html_table = None
+        if self._infer_table_structure:
+            html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
        yield Table(
            text=text_table,
            metadata=self._table_metadata(html_table),
@ -348,7 +362,7 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
    def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
        """Orders the shapes on `slide` from top to bottom and left to right.

-        Returns the the title shape if it exists and the ordered shapes."""
+        Returns the title shape if it exists and the ordered shapes."""

        def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
            for shape in shapes:
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@ -44,6 +44,7 @@ def partition_xlsx(
    file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
+    infer_table_structure: bool = True,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
    metadata_last_modified: Optional[str] = None,
@ -61,6 +62,12 @@ def partition_xlsx(
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the output.
+    infer_table_structure
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
    languages
        User defined value for metadata.languages if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -71,7 +78,7 @@ def partition_xlsx(
    metadata_last_modified
        The day of the last modification
    include_header
-        Determines whether or not header info info is included in text and medatada.text_as_html
+        Determines whether or not header info is included in text and metadata.text_as_html
    """
    exactly_one(filename=filename, file=file)

@ -94,7 +101,11 @@ def partition_xlsx(
    for sheet_name, sheet in sheets.items():
        page_number += 1
        if not find_subtable:
-            html_text = sheet.to_html(index=False, header=include_header, na_rep="")
-            text = soupparser_fromstring(html_text).text_content()
+            # -- Always render the sheet to HTML: the element's `text` is extracted from it
+            # -- whether or not table structure is inferred. `html_text` (which feeds
+            # -- `metadata.text_as_html`) is dropped only after the text has been extracted.
+            sheet_html = sheet.to_html(index=False, header=include_header, na_rep="")
+            text = soupparser_fromstring(sheet_html).text_content()
+            html_text = sheet_html if infer_table_structure else None

            if include_metadata:
@ -158,7 +169,7 @@ def partition_xlsx(
                    text = soupparser_fromstring(html_text).text_content()
                    subtable = Table(text=text)
                    subtable.metadata = metadata
-                    subtable.metadata.text_as_html = html_text
+                    subtable.metadata.text_as_html = html_text if infer_table_structure else None
                    elements.append(subtable)

                if front_non_consecutive is not None and last_non_consecutive is not None: