chore: fix infer_table bug (#1833)

Pass `skip_infer_table_types` down as `infer_table_structure` in the
partition flow. Table elements from PPT/X, DOC/X, etc. no longer have a
`text_as_html` field when their file type is listed in `skip_infer_table_types`.

Note: I've continued to exclude this variable from partitioners that go
through the HTML flow. If we already have the HTML, it doesn't make
sense to carry the infer variable along, since we're not inferring the
HTML table in these cases.


TODO:
  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
This commit is contained in:
Amanda Cameron 2023-10-23 17:11:53 -07:00 committed by GitHub
parent 6707cab250
commit 0584e1d031
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 246 additions and 38 deletions

View File

@ -1,4 +1,4 @@
## 0.10.26-dev1
## 0.10.26-dev2
### Enhancements
@ -10,6 +10,8 @@
### Fixes
* **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable used in `partition` was not being passed down to the file-specific partitioners. Now you can use the `skip_infer_table_types` list in `partition` to name the file types whose Table elements should omit the `text_as_html` metadata field, or set the `infer_table_structure` boolean on the file-specific partitioning function.
## 0.10.25
### Enhancements

View File

@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
Table Extraction for other filetypes
------------------------------------
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs Images, and Excel files which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set the ``skip_infer_table_types`` parameter to specify the document types for which you want to skip table extraction. By default, we skip table extraction for PDFs, images, and Excel files, i.e. ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction for images and PDFs only works with the ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
.. tabs::

View File

@ -28,7 +28,7 @@ def test_it_splits_a_large_section_into_multiple_chunks():
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
" porta volutpat."
" porta volutpat.",
),
]

View File

@ -35,6 +35,24 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
assert elements[0].metadata.filename == filename
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
    """The first element carries `text_as_html` only when `infer_table_structure` is True."""
    elements = partition_csv(
        filename="example-docs/stanley-cups.csv",
        infer_table_structure=infer_table_structure,
    )
    # -- a missing attribute and an attribute set to None both count as "no html" --
    html = getattr(elements[0].metadata, "text_as_html", None)
    assert (html is not None) == infer_table_structure
def test_partition_csv_from_filename_with_metadata_filename(
filename="example-docs/stanley-cups.csv",
):

View File

@ -74,6 +74,25 @@ def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
assert element.metadata.filename is None
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_docx_infer_table_structure(infer_table_structure):
    """The first element carries `text_as_html` only when `infer_table_structure` is True."""
    docx_elements = partition_docx(
        filename="example-docs/fake_table.docx",
        infer_table_structure=infer_table_structure,
    )
    # -- a missing attribute and an attribute set to None both count as "no html" --
    html = getattr(docx_elements[0].metadata, "text_as_html", None)
    assert (html is not None) == infer_table_structure
def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
@ -265,6 +284,7 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dic
None,
None,
False,
True,
None,
)
paragraph = partitioner._document.paragraphs[1]
@ -289,6 +309,7 @@ def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]):
None,
None,
False,
True,
None,
)
table = partitioner._document.tables[0]
@ -305,6 +326,7 @@ def test_table_emphasis(
None,
None,
False,
True,
None,
)
table = partitioner._document.tables[0]
@ -350,7 +372,14 @@ def test_partition_docx_with_json(mock_document, tmpdir):
def test_parse_category_depth_by_style():
partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
partitioner = _DocxPartitioner(
"example-docs/category-level.docx",
None,
None,
False,
True,
None,
)
# Category depths are 0-indexed and relative to the category type
# Title, list item, bullet, narrative text, etc.
@ -381,7 +410,7 @@ def test_parse_category_depth_by_style():
def test_parse_category_depth_by_style_name():
partitioner = _DocxPartitioner(None, None, None, False, None)
partitioner = _DocxPartitioner(None, None, None, False, True, None)
test_cases = [
(0, "Heading 1"),
@ -406,7 +435,7 @@ def test_parse_category_depth_by_style_name():
def test_parse_category_depth_by_style_ilvl():
partitioner = _DocxPartitioner(None, None, None, False, None)
partitioner = _DocxPartitioner(None, None, None, False, True, None)
assert partitioner._parse_category_depth_by_style_ilvl() == 0

View File

@ -1,6 +1,8 @@
import os
import pathlib
import pytest
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, TableChunk, Title
@ -54,6 +56,24 @@ def test_partition_odt_from_file():
]
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_odt_infer_table_structure(infer_table_structure):
    """The second element carries `text_as_html` only when `infer_table_structure` is True."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as f:
        elements = partition_odt(file=f, infer_table_structure=infer_table_structure)
    # -- a missing attribute and an attribute set to None both count as "no html" --
    html = getattr(elements[1].metadata, "text_as_html", None)
    assert (html is not None) == infer_table_structure
def test_partition_odt_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
with open(filename, "rb") as f:

View File

@ -262,6 +262,26 @@ def test_partition_pptx_grabs_tables():
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_pptx_infer_table_structure(infer_table_structure):
    """The second element carries `text_as_html` only when `infer_table_structure` is True."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
    partitioned = partition_pptx(
        filename=pptx_path,
        infer_table_structure=infer_table_structure,
    )
    elements = cast(Sequence[Text], partitioned)
    # -- a missing attribute and an attribute set to None both count as "no html" --
    html = getattr(elements[1].metadata, "text_as_html", None)
    assert (html is not None) == infer_table_structure
def test_partition_pptx_malformed():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
elements = cast(Sequence[Text], partition_pptx(filename=filename))

View File

@ -713,7 +713,7 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition(filename=filename, include_header=False)
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
@ -726,9 +726,36 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html_field"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types, filename, has_text_as_html_field
):
    """`partition` omits `text_as_html` on Tables for types in `skip_infer_table_types`.

    Each case partitions an example document from an open binary file and checks every
    Table element: `text_as_html` must be present (and not None) exactly when
    `has_text_as_html_field` is True.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename, "rb") as f:
        table_elements = [
            e
            for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
            if isinstance(e, Table)
        ]

    # -- without this guard the loop below passes vacuously when no Table is produced --
    assert table_elements, "expected at least one Table element"

    for table_element in table_elements:
        # -- a missing attribute and an attribute set to None both count as "no html" --
        html = getattr(table_element.metadata, "text_as_html", None)
        assert (html is not None) == has_text_as_html_field
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition(file=f, include_header=False)
elements = partition(file=f, include_header=False, skip_infer_table_types=[])
assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
@ -834,7 +861,7 @@ EXPECTED_XLS_TABLE = (
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
elements = partition(filename=filename, include_header=False)
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 18

View File

@ -51,6 +51,27 @@ def test_partition_xlsx_from_filename_with_metadata_filename(
assert elements[0].metadata.filename == "test"
@pytest.mark.parametrize(
    "infer_table_structure",
    [
        True,
        False,
    ],
)
def test_partition_xlsx_infer_table_structure(
    infer_table_structure,
    filename="example-docs/stanley-cups.xlsx",
):
    """Every Table from `partition_xlsx` has `text_as_html` only when inferring structure.

    `text_as_html` must be present (and not None) on each Table element exactly when
    `infer_table_structure` is True.
    """
    elements = partition_xlsx(filename=filename, infer_table_structure=infer_table_structure)
    table_elements = [e for e in elements if isinstance(e, Table)]

    # -- without this guard the loop below passes vacuously when no Table is produced --
    assert table_elements, "expected at least one Table element"

    for table_element in table_elements:
        # -- a missing attribute and an attribute set to None both count as "no html" --
        html = getattr(table_element.metadata, "text_as_html", None)
        assert (html is not None) == infer_table_structure
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_header=True)
assert sum(isinstance(element, Table) for element in elements) == 2

View File

@ -107,8 +107,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "Stanley Cups"
},
@ -220,8 +219,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
@ -333,8 +331,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "Stanley Cups Since 67"
},
@ -446,8 +443,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}

View File

@ -18,8 +18,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "Stanley Cups"
},
@ -42,8 +41,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
@ -66,8 +64,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "Stanley Cups Since 67"
},
@ -90,8 +87,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}

View File

@ -18,8 +18,7 @@
"eng"
],
"page_number": 1,
"page_name": "Example Test",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MC</td>\n <td>What is 2+2?</td>\n <td>4</td>\n <td>correct</td>\n <td>3</td>\n <td>incorrect</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Example Test"
},
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
},
@ -88,8 +87,7 @@
"eng"
],
"page_number": 2,
"page_name": "Format Abbr.",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Format Abbr."
},
"text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
},

View File

@ -1 +1 @@
__version__ = "0.10.26-dev1" # pragma: no cover
__version__ = "0.10.26-dev2" # pragma: no cover

View File

@ -265,6 +265,7 @@ def partition(
elements = _partition_doc(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -274,6 +275,7 @@ def partition(
elements = _partition_docx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -283,6 +285,7 @@ def partition(
elements = _partition_odt(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -331,6 +334,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -351,6 +355,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -361,6 +366,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -406,6 +412,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -416,6 +423,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -426,6 +434,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -442,6 +451,7 @@ def partition(
elements = _partition_xlsx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@ -451,6 +461,7 @@ def partition(
elements = _partition_csv(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,

View File

@ -32,6 +32,7 @@ def partition_csv(
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
languages: Optional[List[str]] = ["auto"],
# NOTE (jennings) partition_csv generates a single TableElement
# so detect_language_per_element is not included as a param
@ -51,6 +52,12 @@ def partition_csv(
The last modified date for the document.
include_metadata
Determines whether or not metadata is included in the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -74,11 +81,12 @@ def partition_csv(
if include_metadata:
metadata = ElementMetadata(
text_as_html=html_text,
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
languages=languages,
)
if infer_table_structure:
metadata.text_as_html = html_text
else:
metadata = ElementMetadata()

View File

@ -88,6 +88,7 @@ def convert_and_partition_docx(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[List[str]] = ["auto"],
@ -108,6 +109,12 @@ def convert_and_partition_docx(
include_metadata
Determines whether or not metadata is included in the metadata attribute on the elements in
the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -153,6 +160,7 @@ def convert_and_partition_docx(
filename=docx_path,
metadata_filename=metadata_filename,
include_metadata=include_metadata,
infer_table_structure=infer_table_structure,
metadata_last_modified=metadata_last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
@ -170,6 +178,7 @@ def partition_docx(
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
include_metadata: bool = True, # used by decorator
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None, # used by decorator
languages: Optional[List[str]] = ["auto"],
@ -184,6 +193,12 @@ def partition_docx(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
metadata_filename
The filename to use for the metadata. Relevant because partition_doc converts the document
to .docx before partition. We want the original source filename in the metadata.
@ -205,6 +220,7 @@ def partition_docx(
file,
metadata_filename,
include_page_breaks,
infer_table_structure,
metadata_last_modified,
)
elements = apply_lang_metadata(
@ -246,12 +262,14 @@ class _DocxPartitioner:
file: Optional[IO[bytes]],
metadata_filename: Optional[str],
include_page_breaks: bool,
infer_table_structure: bool,
metadata_last_modified: Optional[str],
) -> None:
self._filename = filename
self._file = file
self._metadata_filename = metadata_filename
self._include_page_breaks = include_page_breaks
self._infer_table_structure = infer_table_structure
self._metadata_last_modified = metadata_last_modified
self._page_counter: int = 1
@ -262,6 +280,7 @@ class _DocxPartitioner:
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
@ -270,6 +289,7 @@ class _DocxPartitioner:
file,
metadata_filename,
include_page_breaks,
infer_table_structure,
metadata_last_modified,
)._iter_document_elements()
@ -536,8 +556,9 @@ class _DocxPartitioner:
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
# -- at present, we always generate exactly one Table element, but we might want
# -- to skip, for example, an empty table, or accommodate nested tables.
html_table = convert_ms_office_table_to_text(table, as_html=True)
html_table = None
if self._infer_table_structure:
html_table = convert_ms_office_table_to_text(table, as_html=True)
text_table = convert_ms_office_table_to_text(table, as_html=False)
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)

View File

@ -17,6 +17,7 @@ def partition_odt(
filename: Optional[str] = None,
file: Optional[BinaryIO] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
@ -32,6 +33,12 @@ def partition_odt(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
metadata_last_modified
The last modified date for the document.
languages
@ -53,6 +60,7 @@ def partition_odt(
source_format="odt",
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,

View File

@ -22,6 +22,7 @@ def partition_ppt(
file: Optional[IO[bytes]] = None,
include_page_breaks: bool = False,
include_metadata: bool = True,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
@ -39,6 +40,12 @@ def partition_ppt(
A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks
If True, includes a PageBreak element between slides
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
metadata_last_modified
The last modified date for the document.
languages
@ -82,6 +89,7 @@ def partition_ppt(
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
elements = partition_pptx(
filename=pptx_filename,
infer_table_structure=infer_table_structure,
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,

View File

@ -56,6 +56,7 @@ def partition_pptx(
include_metadata: bool = True,
metadata_last_modified: Optional[str] = None,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
chunking_strategy: Optional[str] = None,
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
@ -79,6 +80,12 @@ def partition_pptx(
The last modified date for the document.
include_slide_notes
If True, includes the slide notes as element
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -104,6 +111,7 @@ def partition_pptx(
source_file,
include_page_breaks,
include_slide_notes,
infer_table_structure,
metadata_filename,
metadata_last_modified,
)
@ -126,12 +134,14 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# -- this object in tests and makes them less sensitive to signature changes.
include_page_breaks: bool = True,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
) -> None:
self._file = file
self._include_page_breaks = include_page_breaks
self._include_slide_notes = include_slide_notes
self._infer_table_structure = infer_table_structure
self._metadata_filename = metadata_filename
self._metadata_last_modified = metadata_last_modified
self._page_counter = 0
@ -142,6 +152,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
file: Union[str, IO[bytes]],
include_page_breaks: bool,
include_slide_notes: bool,
infer_table_structure: bool,
metadata_filename: Optional[str],
metadata_last_modified: Optional[str],
) -> Iterator[Element]:
@ -150,6 +161,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
file,
include_page_breaks,
include_slide_notes,
infer_table_structure,
metadata_filename,
metadata_last_modified,
)._iter_presentation_elements()
@ -319,7 +331,9 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
if not text_table:
return
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
html_table = None
if self._infer_table_structure:
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
yield Table(
text=text_table,
metadata=self._table_metadata(html_table),
@ -348,7 +362,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
"""Orders the shapes on `slide` from top to bottom and left to right.
Returns the the title shape if it exists and the ordered shapes."""
Returns the title shape if it exists and the ordered shapes."""
def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
for shape in shapes:

View File

@ -44,6 +44,7 @@ def partition_xlsx(
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
metadata_last_modified: Optional[str] = None,
@ -61,6 +62,12 @@ def partition_xlsx(
A file-like object using "rb" mode --> open(filename, "rb").
include_metadata
Determines whether or not metadata is included in the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for metadata.languages if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@ -71,7 +78,7 @@ def partition_xlsx(
metadata_last_modified
The day of the last modification
include_header
Determines whether or not header info info is included in text and medatada.text_as_html
Determines whether or not header info is included in text and metadata.text_as_html
"""
exactly_one(filename=filename, file=file)
@ -94,7 +101,11 @@ def partition_xlsx(
for sheet_name, sheet in sheets.items():
page_number += 1
if not find_subtable:
html_text = sheet.to_html(index=False, header=include_header, na_rep="")
html_text = (
sheet.to_html(index=False, header=include_header, na_rep="")
if infer_table_structure
else None
)
text = soupparser_fromstring(html_text).text_content()
if include_metadata:
@ -158,7 +169,7 @@ def partition_xlsx(
text = soupparser_fromstring(html_text).text_content()
subtable = Table(text=text)
subtable.metadata = metadata
subtable.metadata.text_as_html = html_text
subtable.metadata.text_as_html = html_text if infer_table_structure else None
elements.append(subtable)
if front_non_consecutive is not None and last_non_consecutive is not None: