Table processing test for RTF (#1388)

This PR does two things:
1. Adds test cases (and alters the sample doc) for RTF and EPUB files with
tables
2. Adds the `xls` and `xlsx` file extensions to the `skip_infer_table_types` default list

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
Amanda Cameron 2023-09-12 18:27:05 -07:00 committed by GitHub
parent 791adf459d
commit 7fd81dc7df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 32 additions and 5 deletions

View File

@ -1,4 +1,4 @@
## 0.10.15-dev5
## 0.10.15-dev6
### Enhancements
@ -6,6 +6,7 @@
* Updated HTML Partitioning to extract tables
* Create and add `add_chunking_strategy` decorator to partition functions
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in pdf partitioning functions
* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
### Features

View File

@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
Table Extraction for other filetypes
------------------------------------
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs and Images, which are ``pdf``, ``jpg`` and ``png``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs, Images, and Excel files, which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
.. tabs::

View File

@ -1,2 +1,10 @@
{\rtf1\ansi\deff0
{\pard \ql \f0 \sa180 \li0 \fi0 \outlinelevel0 \b \fs36 My First Heading\par}
{\pard \ql \f0 \sa180 \li0 \fi0 My first paragraph.\par}
{\pard \sa180 \li0 \fi0 \b Table Example:\par}
{\trowd\cellx3000\cellx6000
\pard\intbl\qc\fs20 Column 1\cell Column 2\cell\row
\pard\intbl\qc\fs20 Row 1, Cell 1\cell Row 1, Cell 2\cell\row
\pard\intbl\qc\fs20 Row 2, Cell 1\cell Row 2, Cell 2\cell\row
}
}

View File

@ -2,6 +2,7 @@ import os
import pathlib
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table
from unstructured.partition.epub import partition_epub
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_to_json
@ -34,6 +35,19 @@ def test_partition_epub_from_filename():
assert all_sections == expected_sections
# Partitions the winter-sports.epub sample document by filename and checks
# that the result contains a Table element carrying an HTML rendering.
def test_partition_epub_from_filename_returns_table_in_elements():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
elements = partition_epub(filename=filename)
assert len(elements) > 0
# NOTE(review): index 14 is presumably where the first table lands for this
# sample document; confirm if the sample file or the partitioner changes.
assert elements[14].text_as_html is not None
# Compare against a Table built from the expected plain text of the table.
assert elements[14] == Table(
text="Contents. \n List of Illustrations "
"(In certain versions of this etext [in certain browsers]"
"\nclicking on the image will bring up a larger version.) "
"\n (etext transcriber's note)",
)
def test_partition_epub_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
elements = partition_epub(filename=filename, metadata_filename="test")

View File

@ -3,7 +3,7 @@ import pathlib
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Title
from unstructured.documents.elements import Table, Title
from unstructured.partition.json import partition_json
from unstructured.partition.rtf import partition_rtf
from unstructured.staging.base import elements_to_json
@ -16,6 +16,10 @@ def test_partition_rtf_from_filename():
elements = partition_rtf(filename=filename)
assert len(elements) > 0
assert elements[0] == Title("My First Heading")
assert elements[-1] == Table(
text="Column 1 \n Column 2 \n Row 1, Cell 1 \n Row 1, "
"Cell 2 \n Row 2, Cell 1 \n Row 2, Cell 2",
)
for element in elements:
assert element.metadata.filename == "fake-doc.rtf"

View File

@ -1 +1 @@
__version__ = "0.10.15-dev5" # pragma: no cover
__version__ = "0.10.15-dev6" # pragma: no cover

View File

@ -126,7 +126,7 @@ def partition(
encoding: Optional[str] = None,
paragraph_grouper: Optional[Callable[[str], str]] = None,
headers: Dict[str, str] = {},
skip_infer_table_types: List[str] = ["pdf", "jpg", "png"],
skip_infer_table_types: List[str] = ["pdf", "jpg", "png", "xls", "xlsx"],
ssl_verify: bool = True,
ocr_languages: str = "eng",
pdf_infer_table_structure: bool = False,