chore: fix infer_table bug (#1833)

Carrying `skip_infer_table_types` to `infer_table_structure` in partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a `text_as_html` field. Note: I've continued to exclude this var from partitioners that go through html flow, I think if we've already got the html it doesn't make sense to carry the infer variable along, since we're not 'infer-ing' the html table in these cases. TODO: ✅ add unit tests --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
2025-10-01 03:13:20 +00:00 · 2023-10-23 17:11:53 -07:00 · 2023-10-23 17:11:53 -07:00 · 0584e1d031
commit 0584e1d031
parent 6707cab250
20 changed files with 246 additions and 38 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.10.26-dev1
+## 0.10.26-dev2
 ### Enhancements
@ -10,6 +10,8 @@
 ### Fixes
 * **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable used in partition was not being passed down to specific file partitioners. Now you can use the `skip_infer_table_types` list parameter of `partition` to name the filetypes for which the `text_as_html` metadata field should be omitted, or set the `infer_table_structure` boolean parameter on the file-specific partitioning function.
 ## 0.10.25
 ### Enhancements
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
 Table Extraction for other filetypes
 ------------------------------------
-We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs Images, and Excel files which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
+We also provide support for enabling and disabling table extraction for file types other than PDF files. Set the ``skip_infer_table_types`` parameter to specify the document types for which you want to skip table extraction. By default, we skip table extraction for PDFs, images, and Excel files, i.e. ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction for images and PDFs only works with the ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
 .. tabs::
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -28,7 +28,7 @@ def test_it_splits_a_large_section_into_multiple_chunks():
        Title("Introduction"),
        Text(
            "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
-            " porta volutpat."
+            " porta volutpat.",
        ),
    ]
--- a/test_unstructured/partition/csv/test_csv.py
+++ b/test_unstructured/partition/csv/test_csv.py
@ -35,6 +35,24 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
    assert elements[0].metadata.filename == filename
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
    """`text_as_html` is set on the first element only when `infer_table_structure` is True."""
    csv_path = "example-docs/stanley-cups.csv"
    first_element = partition_csv(
        filename=csv_path,
        infer_table_structure=infer_table_structure,
    )[0]
    # -- a missing attribute and an explicit None both count as "no html" --
    html_is_present = getattr(first_element.metadata, "text_as_html", None) is not None
    assert html_is_present == infer_table_structure
 def test_partition_csv_from_filename_with_metadata_filename(
    filename="example-docs/stanley-cups.csv",
 ):
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@ -74,6 +74,25 @@ def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
        assert element.metadata.filename is None
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_docx_infer_table_structure(infer_table_structure):
    """`text_as_html` is set on the first element only when `infer_table_structure` is True."""
    first_element = partition_docx(
        filename="example-docs/fake_table.docx",
        infer_table_structure=infer_table_structure,
    )[0]
    # -- a missing attribute and an explicit None both count as "no html" --
    html_is_present = getattr(first_element.metadata, "text_as_html", None) is not None
    assert html_is_present == infer_table_structure
 def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)
@ -265,6 +284,7 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dic
        None,
        None,
        False,
        True,
        None,
    )
    paragraph = partitioner._document.paragraphs[1]
@ -289,6 +309,7 @@ def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]):
        None,
        None,
        False,
        True,
        None,
    )
    table = partitioner._document.tables[0]
@ -305,6 +326,7 @@ def test_table_emphasis(
        None,
        None,
        False,
        True,
        None,
    )
    table = partitioner._document.tables[0]
@ -350,7 +372,14 @@ def test_partition_docx_with_json(mock_document, tmpdir):
 def test_parse_category_depth_by_style():
-    partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
+    partitioner = _DocxPartitioner(
        "example-docs/category-level.docx",
        None,
        None,
        False,
        True,
        None,
    )
    # Category depths are 0-indexed and relative to the category type
    # Title, list item, bullet, narrative text, etc.
@ -381,7 +410,7 @@ def test_parse_category_depth_by_style():
 def test_parse_category_depth_by_style_name():
-    partitioner = _DocxPartitioner(None, None, None, False, None)
+    partitioner = _DocxPartitioner(None, None, None, False, True, None)
    test_cases = [
        (0, "Heading 1"),
@ -406,7 +435,7 @@ def test_parse_category_depth_by_style_name():
 def test_parse_category_depth_by_style_ilvl():
-    partitioner = _DocxPartitioner(None, None, None, False, None)
+    partitioner = _DocxPartitioner(None, None, None, False, True, None)
    assert partitioner._parse_category_depth_by_style_ilvl() == 0
--- a/test_unstructured/partition/odt/test_odt.py
+++ b/test_unstructured/partition/odt/test_odt.py
@ -1,6 +1,8 @@
 import os
 import pathlib
 import pytest
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import Table, TableChunk, Title
@ -54,6 +56,24 @@ def test_partition_odt_from_file():
    ]
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_odt_infer_table_structure(infer_table_structure):
    """`text_as_html` is set on the second element only when `infer_table_structure` is True."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as f:
        elements = partition_odt(file=f, infer_table_structure=infer_table_structure)
    # -- the test inspects the element at index 1 of the partitioned output --
    second_element_metadata = elements[1].metadata
    # -- a missing attribute and an explicit None both count as "no html" --
    html_is_present = getattr(second_element_metadata, "text_as_html", None) is not None
    assert html_is_present == infer_table_structure
 def test_partition_odt_from_file_with_metadata_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(filename, "rb") as f:
--- a/test_unstructured/partition/pptx/test_pptx.py
+++ b/test_unstructured/partition/pptx/test_pptx.py
@ -262,6 +262,26 @@ def test_partition_pptx_grabs_tables():
    assert elements[1].metadata.filename == "fake-power-point-table.pptx"
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_pptx_infer_table_structure(infer_table_structure):
    """`text_as_html` is set on the second element only when `infer_table_structure` is True."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
    partitioned = partition_pptx(filename=pptx_path, infer_table_structure=infer_table_structure)
    elements = cast(Sequence[Text], partitioned)
    # -- the test inspects the element at index 1 of the partitioned output --
    second_element_metadata = elements[1].metadata
    # -- a missing attribute and an explicit None both count as "no html" --
    html_is_present = getattr(second_element_metadata, "text_as_html", None) is not None
    assert html_is_present == infer_table_structure
 def test_partition_pptx_malformed():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
    elements = cast(Sequence[Text], partition_pptx(filename=filename))
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -713,7 +713,7 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh
 def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
-    elements = partition(filename=filename, include_header=False)
+    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
    assert sum(isinstance(element, Table) for element in elements) == 2
    assert sum(isinstance(element, Title) for element in elements) == 2
@ -726,9 +726,36 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
    assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types, filename, has_text_as_html_field
):
    """Every Table from `partition()` has `text_as_html` iff its filetype is not skipped.

    Each case partitions an example document with a given `skip_infer_table_types` list and
    checks every resulting `Table` element, not just the last one.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename, "rb") as f:
        table_elements = [
            e
            for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
            if isinstance(e, Table)
        ]

    # -- without this guard an empty result would let the loop below pass vacuously --
    assert table_elements, f"expected at least one Table element in {filename}"

    for table_element in table_elements:
        # -- a missing attribute and an explicit None both count as "no html" --
        table_element_has_text_as_html_field = (
            hasattr(table_element.metadata, "text_as_html")
            and table_element.metadata.text_as_html is not None
        )
        # -- asserted per element; previously only the final Table was checked --
        assert table_element_has_text_as_html_field == has_text_as_html_field
 def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    with open(filename, "rb") as f:
-        elements = partition(file=f, include_header=False)
+        elements = partition(file=f, include_header=False, skip_infer_table_types=[])
    assert sum(isinstance(element, Table) for element in elements) == 2
    assert sum(isinstance(element, Title) for element in elements) == 2
@ -834,7 +861,7 @@ EXPECTED_XLS_TABLE = (
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
-    elements = partition(filename=filename, include_header=False)
+    elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
    assert sum(isinstance(element, Table) for element in elements) == 2
    assert len(elements) == 18
--- a/test_unstructured/partition/xlsx/test_xlsx.py
+++ b/test_unstructured/partition/xlsx/test_xlsx.py
@ -51,6 +51,27 @@ def test_partition_xlsx_from_filename_with_metadata_filename(
    assert elements[0].metadata.filename == "test"
@pytest.mark.parametrize(
    "infer_table_structure",
    [
        True,
        False,
    ],
)
def test_partition_xlsx_infer_table_structure(
    infer_table_structure,
    filename="example-docs/stanley-cups.xlsx",
):
    """Every XLSX Table has `text_as_html` only when `infer_table_structure` is True."""
    elements = partition_xlsx(filename=filename, infer_table_structure=infer_table_structure)
    table_elements = [e for e in elements if isinstance(e, Table)]

    # -- without this guard an empty result would let the loop below pass vacuously --
    assert table_elements, f"expected at least one Table element in {filename}"

    for table_element in table_elements:
        # -- a missing attribute and an explicit None both count as "no html" --
        table_element_has_text_as_html_field = (
            hasattr(table_element.metadata, "text_as_html")
            and table_element.metadata.text_as_html is not None
        )
        assert table_element_has_text_as_html_field == infer_table_structure
 def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
    elements = partition_xlsx(filename=filename, include_header=True)
    assert sum(isinstance(element, Table) for element in elements) == 2
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared
@ -107,8 +107,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
+      "page_name": "Stanley Cups"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "Stanley Cups"
  },
@ -220,8 +219,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
+      "page_name": "Stanley Cups"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
  },
@ -333,8 +331,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
+      "page_name": "Stanley Cups Since 67"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "Stanley Cups Since 67"
  },
@ -446,8 +443,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
+      "page_name": "Stanley Cups Since 67"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
  }
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
@ -18,8 +18,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
+      "page_name": "Stanley Cups"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "Stanley Cups"
  },
@ -42,8 +41,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Stanley Cups",
+      "page_name": "Stanley Cups"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
  },
@ -66,8 +64,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
+      "page_name": "Stanley Cups Since 67"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "Stanley Cups Since 67"
  },
@ -90,8 +87,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Stanley Cups Since 67",
+      "page_name": "Stanley Cups Since 67"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
  }
--- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
+++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
@ -18,8 +18,7 @@
        "eng"
      ],
      "page_number": 1,
-      "page_name": "Example Test",
+      "page_name": "Example Test"
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>MC</td>\n      <td>What is 2+2?</td>\n      <td>4</td>\n      <td>correct</td>\n      <td>3</td>\n      <td>incorrect</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>What C datatypes are 8 bits? (assume i386)</td>\n      <td>int</td>\n      <td></td>\n      <td>float</td>\n      <td></td>\n      <td>double</td>\n      <td></td>\n      <td>char</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>Bagpipes are awesome.</td>\n      <td>true</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Rank the following in their order of operation.</td>\n      <td>Parentheses</td>\n      <td>Exponents</td>\n      <td>Division</td>\n      <td>Addition</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>The student activities fee is</td>\n      <td>95</td>\n      <td>dollars for students enrolled in</td>\n      <td>19</td>\n      <td>units or more,</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Match the lower-case greek letter with its capital form.</td>\n      <td>λ</td>\n      <td>Λ</td>\n      <td>α</td>\n      <td>γ</td>\n      <td>Γ</td>\n      <td>φ</td>\n      <td>Φ</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
  },
@ -88,8 +87,7 @@
        "eng"
      ],
      "page_number": 2,
-      "page_name": "Format Abbr.",
+      "page_name": "Format Abbr."
      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Abbreviation</td>\n      <td>Question Type</td>\n    </tr>\n    <tr>\n      <td>MC</td>\n      <td>Multiple Choice</td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>Multiple Answer</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>True/False</td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>Essay</td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Ordering</td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Matching</td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>FIL</td>\n      <td>File response</td>\n    </tr>\n    <tr>\n      <td>NUM</td>\n      <td>Numeric Response</td>\n    </tr>\n    <tr>\n      <td>SR</td>\n      <td>Short response</td>\n    </tr>\n    <tr>\n      <td>OP</td>\n      <td>Opinion</td>\n    </tr>\n    <tr>\n      <td>FIB_PLUS</td>\n      <td>Multiple Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>JUMBLED_SENTENCE</td>\n      <td>Jumbled Sentence</td>\n    </tr>\n    <tr>\n      <td>QUIZ_BOWL</td>\n      <td>Quiz Bowl</td>\n    </tr>\n  </tbody>\n</table>"
    },
    "text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
  },
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.26-dev1"  # pragma: no cover
+__version__ = "0.10.26-dev2"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -265,6 +265,7 @@ def partition(
        elements = _partition_doc(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -274,6 +275,7 @@ def partition(
        elements = _partition_docx(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -283,6 +285,7 @@ def partition(
        elements = _partition_odt(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -331,6 +334,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -351,6 +355,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -361,6 +366,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -406,6 +412,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -416,6 +423,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -426,6 +434,7 @@ def partition(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -442,6 +451,7 @@ def partition(
        elements = _partition_xlsx(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
@ -451,6 +461,7 @@ def partition(
        elements = _partition_csv(
            filename=filename,
            file=file,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
            **kwargs,
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@ -32,6 +32,7 @@ def partition_csv(
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    languages: Optional[List[str]] = ["auto"],
    # NOTE (jennings) partition_csv generates a single TableElement
    # so detect_language_per_element is not included as a param
@ -51,6 +52,12 @@ def partition_csv(
        The last modified date for the document.
    include_metadata
        Determines whether or not metadata is included in the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -74,11 +81,12 @@ def partition_csv(
    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            filename=metadata_filename or filename,
            last_modified=metadata_last_modified or last_modification_date,
            languages=languages,
        )
        if infer_table_structure:
            metadata.text_as_html = html_text
    else:
        metadata = ElementMetadata()
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -88,6 +88,7 @@ def convert_and_partition_docx(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
@ -108,6 +109,12 @@ def convert_and_partition_docx(
    include_metadata
        Determines whether or not metadata is included in the metadata attribute on the elements in
        the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -153,6 +160,7 @@ def convert_and_partition_docx(
            filename=docx_path,
            metadata_filename=metadata_filename,
            include_metadata=include_metadata,
            infer_table_structure=infer_table_structure,
            metadata_last_modified=metadata_last_modified,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
@ -170,6 +178,7 @@ def partition_docx(
    metadata_filename: Optional[str] = None,
    include_page_breaks: bool = True,
    include_metadata: bool = True,  # used by decorator
    infer_table_structure: bool = True,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,  # used by decorator
    languages: Optional[List[str]] = ["auto"],
@ -184,6 +193,12 @@ def partition_docx(
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the document
        to .docx before partition. We want the original source filename in the metadata.
@ -205,6 +220,7 @@ def partition_docx(
        file,
        metadata_filename,
        include_page_breaks,
        infer_table_structure,
        metadata_last_modified,
    )
    elements = apply_lang_metadata(
@ -246,12 +262,14 @@ class _DocxPartitioner:
        file: Optional[IO[bytes]],
        metadata_filename: Optional[str],
        include_page_breaks: bool,
        infer_table_structure: bool,
        metadata_last_modified: Optional[str],
    ) -> None:
        self._filename = filename
        self._file = file
        self._metadata_filename = metadata_filename
        self._include_page_breaks = include_page_breaks
        self._infer_table_structure = infer_table_structure
        self._metadata_last_modified = metadata_last_modified
        self._page_counter: int = 1
@ -262,6 +280,7 @@ class _DocxPartitioner:
        file: Optional[IO[bytes]] = None,
        metadata_filename: Optional[str] = None,
        include_page_breaks: bool = True,
        infer_table_structure: bool = True,
        metadata_last_modified: Optional[str] = None,
    ) -> Iterator[Element]:
        """Partition MS Word documents (.docx format) into its document elements."""
@ -270,6 +289,7 @@ class _DocxPartitioner:
            file,
            metadata_filename,
            include_page_breaks,
            infer_table_structure,
            metadata_last_modified,
        )._iter_document_elements()
@ -536,7 +556,8 @@ class _DocxPartitioner:
        """Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
        # -- at present, we always generate exactly one Table element, but we might want
        # -- to skip, for example, an empty table, or accommodate nested tables.
-
+        html_table = None
        if self._infer_table_structure:
            html_table = convert_ms_office_table_to_text(table, as_html=True)
        text_table = convert_ms_office_table_to_text(table, as_html=False)
        emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@ -17,6 +17,7 @@ def partition_odt(
    filename: Optional[str] = None,
    file: Optional[BinaryIO] = None,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
@ -32,6 +33,12 @@ def partition_odt(
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_last_modified
        The last modified date for the document.
    languages
@ -53,6 +60,7 @@ def partition_odt(
        source_format="odt",
        filename=filename,
        file=file,
        infer_table_structure=infer_table_structure,
        metadata_filename=metadata_filename,
        metadata_last_modified=metadata_last_modified or last_modification_date,
        languages=languages,
--- a/unstructured/partition/ppt.py
+++ b/unstructured/partition/ppt.py
@ -22,6 +22,7 @@ def partition_ppt(
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
@ -39,6 +40,12 @@ def partition_ppt(
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, includes a PageBreak element between slides
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_last_modified
        The last modified date for the document.
    languages
@ -82,6 +89,7 @@ def partition_ppt(
        pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
        elements = partition_pptx(
            filename=pptx_filename,
            infer_table_structure=infer_table_structure,
            metadata_filename=metadata_filename,
            metadata_last_modified=metadata_last_modified or last_modification_date,
            languages=languages,
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@ -56,6 +56,7 @@ def partition_pptx(
    include_metadata: bool = True,
    metadata_last_modified: Optional[str] = None,
    include_slide_notes: bool = False,
    infer_table_structure: bool = True,
    chunking_strategy: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
@ -79,6 +80,12 @@ def partition_pptx(
        The last modified date for the document.
    include_slide_notes
        If True, includes the slide notes as element
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -104,6 +111,7 @@ def partition_pptx(
        source_file,
        include_page_breaks,
        include_slide_notes,
        infer_table_structure,
        metadata_filename,
        metadata_last_modified,
    )
@ -126,12 +134,14 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
        # -- this object in tests and makes them less sensitive to signature changes.
        include_page_breaks: bool = True,
        include_slide_notes: bool = False,
        infer_table_structure: bool = True,
        metadata_filename: Optional[str] = None,
        metadata_last_modified: Optional[str] = None,
    ) -> None:
        self._file = file
        self._include_page_breaks = include_page_breaks
        self._include_slide_notes = include_slide_notes
        self._infer_table_structure = infer_table_structure
        self._metadata_filename = metadata_filename
        self._metadata_last_modified = metadata_last_modified
        self._page_counter = 0
@ -142,6 +152,7 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
        file: Union[str, IO[bytes]],
        include_page_breaks: bool,
        include_slide_notes: bool,
        infer_table_structure: bool,
        metadata_filename: Optional[str],
        metadata_last_modified: Optional[str],
    ) -> Iterator[Element]:
@ -150,6 +161,7 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
            file,
            include_page_breaks,
            include_slide_notes,
            infer_table_structure,
            metadata_filename,
            metadata_last_modified,
        )._iter_presentation_elements()
@ -319,6 +331,8 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
        text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
        if not text_table:
            return
        html_table = None
        if self._infer_table_structure:
            html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
        yield Table(
            text=text_table,
@ -348,7 +362,7 @@ class _PptxPartitioner:  # pyright: ignore[reportUnusedClass]
    def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
        """Orders the shapes on `slide` from top to bottom and left to right.
-        Returns the the title shape if it exists and the ordered shapes."""
+        Returns the title shape if it exists and the ordered shapes."""
        def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
            for shape in shapes:
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@ -44,6 +44,7 @@ def partition_xlsx(
    file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
    metadata_last_modified: Optional[str] = None,
@ -61,6 +62,12 @@ def partition_xlsx(
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for metadata.languages if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -71,7 +78,7 @@ def partition_xlsx(
    metadata_last_modified
        The day of the last modification
    include_header
-        Determines whether or not header info info is included in text and medatada.text_as_html
+        Determines whether or not header info is included in text and metadata.text_as_html
    """
    exactly_one(filename=filename, file=file)
@ -94,7 +101,11 @@ def partition_xlsx(
    for sheet_name, sheet in sheets.items():
        page_number += 1
        if not find_subtable:
-            html_text = sheet.to_html(index=False, header=include_header, na_rep="")
+            html_text = (
                sheet.to_html(index=False, header=include_header, na_rep="")
                if infer_table_structure
                else None
            )
            text = soupparser_fromstring(html_text).text_content()
            if include_metadata:
@ -158,7 +169,7 @@ def partition_xlsx(
                    text = soupparser_fromstring(html_text).text_content()
                    subtable = Table(text=text)
                    subtable.metadata = metadata
-                    subtable.metadata.text_as_html = html_text
+                    subtable.metadata.text_as_html = html_text if infer_table_structure else None
                    elements.append(subtable)
                if front_non_consecutive is not None and last_non_consecutive is not None:
`@ -1 +1 @@`
	`__version__ = "0.10.26-dev1" # pragma: no cover`	`__version__ = "0.10.26-dev2" # pragma: no cover`