Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-06-27 02:30:08 +00:00
chore: fix infer_table bug (#1833)
Carries `skip_infer_table_types` through to `infer_table_structure` in the
partition flow. Now Table elements from PPT/X, DOC/X, etc. no longer get a
`text_as_html` field when their file type is skipped.
Note: I've continued to exclude this variable from partitioners that go
through the HTML flow. If we've already got the HTML, it doesn't make sense
to carry the infer variable along, since we're not inferring the HTML table
in those cases.
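For context, here is a minimal sketch of the mapping this change makes between `skip_infer_table_types` and the per-partitioner `infer_table_structure` flag. The helper name `resolve_infer_table_structure` and its signature are illustrative, not the library's actual internals.

```python
from typing import List, Optional


def resolve_infer_table_structure(
    filetype: str,
    skip_infer_table_types: Optional[List[str]] = None,
    infer_table_structure: bool = True,
) -> bool:
    """Decide the `infer_table_structure` value to hand to a file-specific partitioner.

    Table structure (and thus `text_as_html` metadata) is inferred only when the caller
    asked for it and the file type is not listed in `skip_infer_table_types`.
    """
    skip = skip_infer_table_types or []
    return infer_table_structure and filetype not in skip


# With the default skip list, docx tables still get text_as_html, xlsx tables do not.
assert resolve_infer_table_structure("docx", ["pdf", "jpg", "png", "xlsx", "xls"]) is True
assert resolve_infer_table_structure("xlsx", ["pdf", "jpg", "png", "xlsx", "xls"]) is False
```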
TODO:
✅ add unit tests
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
This commit is contained in:
parent 6707cab250
commit 0584e1d031
@@ -1,4 +1,4 @@
## 0.10.26-dev1
## 0.10.26-dev2

### Enhancements

@@ -10,6 +10,8 @@

### Fixes

* **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable passed to `partition` was not forwarded to the file-specific partitioners. Now you can use the `skip_infer_table_types` list in `partition` to name the file types whose Table elements should not get a `text_as_html` metadata field, or set the `infer_table_structure` boolean on the file-specific partitioning function directly.
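For illustration, a minimal usage sketch of the fixed behavior (not part of the changelog itself; the file path is a placeholder):

```python
from unstructured.partition.auto import partition

# Skip table extraction for .docx files: their Table elements get no text_as_html metadata.
elements = partition(filename="example.docx", skip_infer_table_types=["docx"])

# Pass an empty list to keep table extraction enabled for every file type.
elements = partition(filename="example.docx", skip_infer_table_types=[])
```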
## 0.10.25

### Enhancements
@@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
Table Extraction for other filetypes
------------------------------------

We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs Images, and Excel files which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs, Images, and Excel files which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction for Images and PDFs only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:

.. tabs::
@@ -28,7 +28,7 @@ def test_it_splits_a_large_section_into_multiple_chunks():
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
" porta volutpat."
" porta volutpat.",
),
]
@@ -35,6 +35,24 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
assert elements[0].metadata.filename == filename


@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
f_path = "example-docs/stanley-cups.csv"
elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure)

table_element_has_text_as_html_field = (
hasattr(elements[0].metadata, "text_as_html")
and elements[0].metadata.text_as_html is not None
)
assert table_element_has_text_as_html_field == infer_table_structure


def test_partition_csv_from_filename_with_metadata_filename(
filename="example-docs/stanley-cups.csv",
):
@@ -74,6 +74,25 @@ def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
assert element.metadata.filename is None


@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_docx_infer_table_structure(infer_table_structure):
elements = partition_docx(
filename="example-docs/fake_table.docx",
infer_table_structure=infer_table_structure,
)
table_element_has_text_as_html_field = (
hasattr(elements[0].metadata, "text_as_html")
and elements[0].metadata.text_as_html is not None
)
assert table_element_has_text_as_html_field == infer_table_structure


def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
@@ -265,6 +284,7 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dic
None,
None,
False,
True,
None,
)
paragraph = partitioner._document.paragraphs[1]
@@ -289,6 +309,7 @@ def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]):
None,
None,
False,
True,
None,
)
table = partitioner._document.tables[0]
@@ -305,6 +326,7 @@ def test_table_emphasis(
None,
None,
False,
True,
None,
)
table = partitioner._document.tables[0]
@@ -350,7 +372,14 @@ def test_partition_docx_with_json(mock_document, tmpdir):


def test_parse_category_depth_by_style():
partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
partitioner = _DocxPartitioner(
"example-docs/category-level.docx",
None,
None,
False,
True,
None,
)

# Category depths are 0-indexed and relative to the category type
# Title, list item, bullet, narrative text, etc.
@@ -381,7 +410,7 @@ def test_parse_category_depth_by_style():


def test_parse_category_depth_by_style_name():
partitioner = _DocxPartitioner(None, None, None, False, None)
partitioner = _DocxPartitioner(None, None, None, False, True, None)

test_cases = [
(0, "Heading 1"),
@@ -406,7 +435,7 @@ def test_parse_category_depth_by_style_name():


def test_parse_category_depth_by_style_ilvl():
partitioner = _DocxPartitioner(None, None, None, False, None)
partitioner = _DocxPartitioner(None, None, None, False, True, None)
assert partitioner._parse_category_depth_by_style_ilvl() == 0
@@ -1,6 +1,8 @@
import os
import pathlib

import pytest

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, TableChunk, Title
@@ -54,6 +56,24 @@ def test_partition_odt_from_file():
]


@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_odt_infer_table_structure(infer_table_structure):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
with open(filename, "rb") as f:
elements = partition_odt(file=f, infer_table_structure=infer_table_structure)
table_element_has_text_as_html_field = (
hasattr(elements[1].metadata, "text_as_html")
and elements[1].metadata.text_as_html is not None
)
assert table_element_has_text_as_html_field == infer_table_structure


def test_partition_odt_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
with open(filename, "rb") as f:
@@ -262,6 +262,26 @@ def test_partition_pptx_grabs_tables():
assert elements[1].metadata.filename == "fake-power-point-table.pptx"


@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_pptx_infer_table_structure(infer_table_structure):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
elements = cast(
Sequence[Text],
partition_pptx(filename=filename, infer_table_structure=infer_table_structure),
)
table_element_has_text_as_html_field = (
hasattr(elements[1].metadata, "text_as_html")
and elements[1].metadata.text_as_html is not None
)
assert table_element_has_text_as_html_field == infer_table_structure


def test_partition_pptx_malformed():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
elements = cast(Sequence[Text], partition_pptx(filename=filename))
@@ -713,7 +713,7 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh


def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition(filename=filename, include_header=False)
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
@@ -726,9 +726,36 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE


@pytest.mark.parametrize(
("skip_infer_table_types", "filename", "has_text_as_html_field"),
[
(["xlsx"], "stanley-cups.xlsx", False),
([], "stanley-cups.xlsx", True),
(["odt"], "fake.odt", False),
([], "fake.odt", True),
],
)
def test_auto_partition_respects_skip_infer_table_types(
skip_infer_table_types, filename, has_text_as_html_field
):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
with open(filename, "rb") as f:
table_elements = [
e
for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
if isinstance(e, Table)
]
for table_element in table_elements:
table_element_has_text_as_html_field = (
hasattr(table_element.metadata, "text_as_html")
and table_element.metadata.text_as_html is not None
)
assert table_element_has_text_as_html_field == has_text_as_html_field


def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition(file=f, include_header=False)
elements = partition(file=f, include_header=False, skip_infer_table_types=[])

assert sum(isinstance(element, Table) for element in elements) == 2
assert sum(isinstance(element, Title) for element in elements) == 2
@@ -834,7 +861,7 @@ EXPECTED_XLS_TABLE = (

@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
elements = partition(filename=filename, include_header=False)
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])

assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 18
@@ -51,6 +51,27 @@ def test_partition_xlsx_from_filename_with_metadata_filename(
assert elements[0].metadata.filename == "test"


@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_xlsx_infer_table_structure(
infer_table_structure,
filename="example-docs/stanley-cups.xlsx",
):
elements = partition_xlsx(filename=filename, infer_table_structure=infer_table_structure)
table_elements = [e for e in elements if isinstance(e, Table)]
for table_element in table_elements:
table_element_has_text_as_html_field = (
hasattr(table_element.metadata, "text_as_html")
and table_element.metadata.text_as_html is not None
)
assert table_element_has_text_as_html_field == infer_table_structure


def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_header=True)
assert sum(isinstance(element, Table) for element in elements) == 2
@@ -107,8 +107,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "Stanley Cups"
},
@@ -220,8 +219,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
@@ -333,8 +331,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "Stanley Cups Since 67"
},
@@ -446,8 +443,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
@@ -18,8 +18,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "Stanley Cups"
},
@@ -42,8 +41,7 @@
"eng"
],
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
@@ -66,8 +64,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "Stanley Cups Since 67"
},
@@ -90,8 +87,7 @@
"eng"
],
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Stanley Cups Since 67"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
@@ -18,8 +18,7 @@
"eng"
],
"page_number": 1,
"page_name": "Example Test",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MC</td>\n <td>What is 2+2?</td>\n <td>4</td>\n <td>correct</td>\n <td>3</td>\n <td>incorrect</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Example Test"
},
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
},
@@ -88,8 +87,7 @@
"eng"
],
"page_number": 2,
"page_name": "Format Abbr.",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
"page_name": "Format Abbr."
},
"text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
},
@@ -1 +1 @@
__version__ = "0.10.26-dev1" # pragma: no cover
__version__ = "0.10.26-dev2" # pragma: no cover
@@ -265,6 +265,7 @@ def partition(
elements = _partition_doc(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -274,6 +275,7 @@ def partition(
elements = _partition_docx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -283,6 +285,7 @@ def partition(
elements = _partition_odt(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -331,6 +334,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -351,6 +355,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -361,6 +366,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -406,6 +412,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -416,6 +423,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -426,6 +434,7 @@ def partition(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -442,6 +451,7 @@ def partition(
elements = _partition_xlsx(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -451,6 +461,7 @@ def partition(
elements = _partition_csv(
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
**kwargs,
@@ -32,6 +32,7 @@ def partition_csv(
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
languages: Optional[List[str]] = ["auto"],
# NOTE (jennings) partition_csv generates a single TableElement
# so detect_language_per_element is not included as a param
@@ -51,6 +52,12 @@ def partition_csv(
The last modified date for the document.
include_metadata
Determines whether or not metadata is included in the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@@ -74,11 +81,12 @@ def partition_csv(

if include_metadata:
metadata = ElementMetadata(
text_as_html=html_text,
filename=metadata_filename or filename,
last_modified=metadata_last_modified or last_modification_date,
languages=languages,
)
if infer_table_structure:
metadata.text_as_html = html_text
else:
metadata = ElementMetadata()
@@ -88,6 +88,7 @@ def convert_and_partition_docx(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[List[str]] = ["auto"],
@@ -108,6 +109,12 @@ def convert_and_partition_docx(
include_metadata
Determines whether or not metadata is included in the metadata attribute on the elements in
the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@@ -153,6 +160,7 @@ def convert_and_partition_docx(
filename=docx_path,
metadata_filename=metadata_filename,
include_metadata=include_metadata,
infer_table_structure=infer_table_structure,
metadata_last_modified=metadata_last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
@@ -170,6 +178,7 @@ def partition_docx(
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
include_metadata: bool = True, # used by decorator
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None, # used by decorator
languages: Optional[List[str]] = ["auto"],
@@ -184,6 +193,12 @@ def partition_docx(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
metadata_filename
The filename to use for the metadata. Relevant because partition_doc converts the document
to .docx before partition. We want the original source filename in the metadata.
@@ -205,6 +220,7 @@ def partition_docx(
file,
metadata_filename,
include_page_breaks,
infer_table_structure,
metadata_last_modified,
)
elements = apply_lang_metadata(
@@ -246,12 +262,14 @@ class _DocxPartitioner:
file: Optional[IO[bytes]],
metadata_filename: Optional[str],
include_page_breaks: bool,
infer_table_structure: bool,
metadata_last_modified: Optional[str],
) -> None:
self._filename = filename
self._file = file
self._metadata_filename = metadata_filename
self._include_page_breaks = include_page_breaks
self._infer_table_structure = infer_table_structure
self._metadata_last_modified = metadata_last_modified
self._page_counter: int = 1
@@ -262,6 +280,7 @@ class _DocxPartitioner:
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
@@ -270,6 +289,7 @@ class _DocxPartitioner:
file,
metadata_filename,
include_page_breaks,
infer_table_structure,
metadata_last_modified,
)._iter_document_elements()
@@ -536,8 +556,9 @@ class _DocxPartitioner:
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
# -- at present, we always generate exactly one Table element, but we might want
# -- to skip, for example, an empty table, or accommodate nested tables.

html_table = convert_ms_office_table_to_text(table, as_html=True)
html_table = None
if self._infer_table_structure:
html_table = convert_ms_office_table_to_text(table, as_html=True)
text_table = convert_ms_office_table_to_text(table, as_html=False)
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
@@ -17,6 +17,7 @@ def partition_odt(
filename: Optional[str] = None,
file: Optional[BinaryIO] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
@@ -32,6 +33,12 @@ def partition_odt(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
metadata_last_modified
The last modified date for the document.
languages
@@ -53,6 +60,7 @@ def partition_odt(
source_format="odt",
filename=filename,
file=file,
infer_table_structure=infer_table_structure,
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,
@@ -22,6 +22,7 @@ def partition_ppt(
file: Optional[IO[bytes]] = None,
include_page_breaks: bool = False,
include_metadata: bool = True,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
@@ -39,6 +40,12 @@ def partition_ppt(
A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks
If True, includes a PageBreak element between slides
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
metadata_last_modified
The last modified date for the document.
languages
@@ -82,6 +89,7 @@ def partition_ppt(
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
elements = partition_pptx(
filename=pptx_filename,
infer_table_structure=infer_table_structure,
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,
@@ -56,6 +56,7 @@ def partition_pptx(
include_metadata: bool = True,
metadata_last_modified: Optional[str] = None,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
chunking_strategy: Optional[str] = None,
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
@@ -79,6 +80,12 @@ def partition_pptx(
The last modified date for the document.
include_slide_notes
If True, includes the slide notes as element
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@@ -104,6 +111,7 @@ def partition_pptx(
source_file,
include_page_breaks,
include_slide_notes,
infer_table_structure,
metadata_filename,
metadata_last_modified,
)
@@ -126,12 +134,14 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# -- this object in tests and makes them less sensitive to signature changes.
include_page_breaks: bool = True,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
) -> None:
self._file = file
self._include_page_breaks = include_page_breaks
self._include_slide_notes = include_slide_notes
self._infer_table_structure = infer_table_structure
self._metadata_filename = metadata_filename
self._metadata_last_modified = metadata_last_modified
self._page_counter = 0
@@ -142,6 +152,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
file: Union[str, IO[bytes]],
include_page_breaks: bool,
include_slide_notes: bool,
infer_table_structure: bool,
metadata_filename: Optional[str],
metadata_last_modified: Optional[str],
) -> Iterator[Element]:
@@ -150,6 +161,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
file,
include_page_breaks,
include_slide_notes,
infer_table_structure,
metadata_filename,
metadata_last_modified,
)._iter_presentation_elements()
@@ -319,7 +331,9 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
if not text_table:
return
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
html_table = None
if self._infer_table_structure:
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
yield Table(
text=text_table,
metadata=self._table_metadata(html_table),
@@ -348,7 +362,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
"""Orders the shapes on `slide` from top to bottom and left to right.

Returns the the title shape if it exists and the ordered shapes."""
Returns the title shape if it exists and the ordered shapes."""

def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
for shape in shapes:
@@ -44,6 +44,7 @@ def partition_xlsx(
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
infer_table_structure: bool = True,
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
metadata_last_modified: Optional[str] = None,
@@ -61,6 +62,12 @@ def partition_xlsx(
A file-like object using "rb" mode --> open(filename, "rb").
include_metadata
Determines whether or not metadata is included in the output.
infer_table_structure
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
languages
User defined value for metadata.languages if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
@@ -71,7 +78,7 @@ def partition_xlsx(
metadata_last_modified
The day of the last modification
include_header
Determines whether or not header info info is included in text and medatada.text_as_html
Determines whether or not header info is included in text and medatada.text_as_html
"""
exactly_one(filename=filename, file=file)
@@ -94,7 +101,11 @@ def partition_xlsx(
for sheet_name, sheet in sheets.items():
page_number += 1
if not find_subtable:
html_text = sheet.to_html(index=False, header=include_header, na_rep="")
html_text = (
sheet.to_html(index=False, header=include_header, na_rep="")
if infer_table_structure
else None
)
text = soupparser_fromstring(html_text).text_content()

if include_metadata:
@@ -158,7 +169,7 @@ def partition_xlsx(
text = soupparser_fromstring(html_text).text_content()
subtable = Table(text=text)
subtable.metadata = metadata
subtable.metadata.text_as_html = html_text
subtable.metadata.text_as_html = html_text if infer_table_structure else None
elements.append(subtable)

if front_non_consecutive is not None and last_non_consecutive is not None: