mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Table processing test for RTF (#1388)
This PR does two things: 1. Adds test case (and alters sample doc) for rtf and epub files with table 2. Adds `xls/x` file extension to `skip_infer_table_types` default list --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
parent
791adf459d
commit
7fd81dc7df
@ -1,4 +1,4 @@
|
||||
## 0.10.15-dev5
|
||||
## 0.10.15-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
* Updated HTML Partitioning to extract tables
|
||||
* Create and add `add_chunking_strategy` decorator to partition functions
|
||||
* Adds `languages` as an input parameter and marks `ocr_languages` kwarg for deprecation in pdf partitioning functions
|
||||
* Adds `xlsx` and `xls` to `skip_infer_table_types` default list in `partition`
|
||||
|
||||
### Features
|
||||
|
||||
|
@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
|
||||
Table Extraction for other filetypes
|
||||
------------------------------------
|
||||
|
||||
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs and Images, which are ``pdf``, ``jpg`` and ``png``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
|
||||
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction for. By default, we skip table extraction for PDFs, images, and Excel files, which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with the ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
|
||||
|
||||
.. tabs::
|
||||
|
||||
|
@ -1,2 +1,10 @@
|
||||
{\rtf1\ansi\deff0
|
||||
{\pard \ql \f0 \sa180 \li0 \fi0 \outlinelevel0 \b \fs36 My First Heading\par}
|
||||
{\pard \ql \f0 \sa180 \li0 \fi0 My first paragraph.\par}
|
||||
{\pard \sa180 \li0 \fi0 \b Table Example:\par}
|
||||
{\trowd\cellx3000\cellx6000
|
||||
\pard\intbl\qc\fs20 Column 1\cell Column 2\cell\row
|
||||
\pard\intbl\qc\fs20 Row 1, Cell 1\cell Row 1, Cell 2\cell\row
|
||||
\pard\intbl\qc\fs20 Row 2, Cell 1\cell Row 2, Cell 2\cell\row
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ import os
|
||||
import pathlib
|
||||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.epub import partition_epub
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.staging.base import elements_to_json
|
||||
@ -34,6 +35,19 @@ def test_partition_epub_from_filename():
|
||||
assert all_sections == expected_sections
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_returns_table_in_elements():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[14].text_as_html is not None
|
||||
assert elements[14] == Table(
|
||||
text="Contents. \n List of Illustrations "
|
||||
"(In certain versions of this etext [in certain browsers]"
|
||||
"\nclicking on the image will bring up a larger version.) "
|
||||
"\n (etext transcriber's note)",
|
||||
)
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition_epub(filename=filename, metadata_filename="test")
|
||||
|
@ -3,7 +3,7 @@ import pathlib
|
||||
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.documents.elements import Table, Title
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
from unstructured.staging.base import elements_to_json
|
||||
@ -16,6 +16,10 @@ def test_partition_rtf_from_filename():
|
||||
elements = partition_rtf(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0] == Title("My First Heading")
|
||||
assert elements[-1] == Table(
|
||||
text="Column 1 \n Column 2 \n Row 1, Cell 1 \n Row 1, "
|
||||
"Cell 2 \n Row 2, Cell 1 \n Row 2, Cell 2",
|
||||
)
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "fake-doc.rtf"
|
||||
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.15-dev5" # pragma: no cover
|
||||
__version__ = "0.10.15-dev6" # pragma: no cover
|
||||
|
@ -126,7 +126,7 @@ def partition(
|
||||
encoding: Optional[str] = None,
|
||||
paragraph_grouper: Optional[Callable[[str], str]] = None,
|
||||
headers: Dict[str, str] = {},
|
||||
skip_infer_table_types: List[str] = ["pdf", "jpg", "png"],
|
||||
skip_infer_table_types: List[str] = ["pdf", "jpg", "png", "xls", "xlsx"],
|
||||
ssl_verify: bool = True,
|
||||
ocr_languages: str = "eng",
|
||||
pdf_infer_table_structure: bool = False,
|
||||
|
Loading…
x
Reference in New Issue
Block a user