Table processing test for RTF (#1388)

This PR does two things: (1) adds test cases (and alters the sample doc) for RTF and EPUB files that contain a table; (2) adds the `xls` and `xlsx` file extensions to the `skip_infer_table_types` default list. --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2025-06-27 02:30:08 +00:00 · 2023-09-12 18:27:05 -07:00 · 2023-09-12 18:27:05 -07:00 · 7fd81dc7df
commit 7fd81dc7df
parent 791adf459d
7 changed files with 32 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.10.15-dev5
+## 0.10.15-dev6

 ### Enhancements

@ -6,6 +6,7 @@
 * Updated HTML Partitioning to extract tables
 * Create and add `add_chunking_strategy` decorator to partition functions
 * Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in pdf partitioning functions
+* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`

 ### Features

--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
 Table Extraction for other filetypes
 ------------------------------------

-We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs and Images, which are ``pdf``, ``jpg`` and ``png``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
+We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs, Images, and Excel files, which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:

 .. tabs::

--- a/example-docs/fake-doc.rtf
+++ b/example-docs/fake-doc.rtf
@ -1,2 +1,10 @@
+{\rtf1\ansi\deff0
 {\pard \ql \f0 \sa180 \li0 \fi0 \outlinelevel0 \b \fs36 My First Heading\par}
 {\pard \ql \f0 \sa180 \li0 \fi0 My first paragraph.\par}
+{\pard \sa180 \li0 \fi0 \b Table Example:\par}
+{\trowd\cellx3000\cellx6000
+\pard\intbl\qc\fs20 Column 1\cell Column 2\cell\row
+\pard\intbl\qc\fs20 Row 1, Cell 1\cell Row 1, Cell 2\cell\row
+\pard\intbl\qc\fs20 Row 2, Cell 1\cell Row 2, Cell 2\cell\row
+}
+}
--- a/test_unstructured/partition/epub/test_epub.py
+++ b/test_unstructured/partition/epub/test_epub.py
@ -2,6 +2,7 @@ import os
 import pathlib

 from unstructured.chunking.title import chunk_by_title
+from unstructured.documents.elements import Table
 from unstructured.partition.epub import partition_epub
 from unstructured.partition.json import partition_json
 from unstructured.staging.base import elements_to_json
@ -34,6 +35,19 @@ def test_partition_epub_from_filename():
    assert all_sections == expected_sections


+def test_partition_epub_from_filename_returns_table_in_elements():
+    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
+    elements = partition_epub(filename=filename)
+    assert len(elements) > 0
+    assert elements[14].text_as_html is not None
+    assert elements[14] == Table(
+        text="Contents. \n List of Illustrations   "
+        "(In certain versions of this etext [in certain browsers]"
+        "\nclicking on the image will bring up a larger version.) "
+        "\n (etext transcriber's note)",
+    )
+
+
 def test_partition_epub_from_filename_with_metadata_filename():
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
    elements = partition_epub(filename=filename, metadata_filename="test")
--- a/test_unstructured/partition/pypandoc/test_rtf.py
+++ b/test_unstructured/partition/pypandoc/test_rtf.py
@ -3,7 +3,7 @@ import pathlib

 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
-from unstructured.documents.elements import Title
+from unstructured.documents.elements import Table, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.rtf import partition_rtf
 from unstructured.staging.base import elements_to_json
@ -16,6 +16,10 @@ def test_partition_rtf_from_filename():
    elements = partition_rtf(filename=filename)
    assert len(elements) > 0
    assert elements[0] == Title("My First Heading")
+    assert elements[-1] == Table(
+        text="Column 1 \n Column 2 \n Row 1, Cell 1 \n Row 1, "
+        "Cell 2 \n Row 2, Cell 1 \n Row 2, Cell 2",
+    )
    for element in elements:
        assert element.metadata.filename == "fake-doc.rtf"

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.15-dev5"  # pragma: no cover
+__version__ = "0.10.15-dev6"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -126,7 +126,7 @@ def partition(
    encoding: Optional[str] = None,
    paragraph_grouper: Optional[Callable[[str], str]] = None,
    headers: Dict[str, str] = {},
-    skip_infer_table_types: List[str] = ["pdf", "jpg", "png"],
+    skip_infer_table_types: List[str] = ["pdf", "jpg", "png", "xls", "xlsx"],
    ssl_verify: bool = True,
    ocr_languages: str = "eng",
    pdf_infer_table_structure: bool = False,