mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
chore: fix infer_table bug (#1833)
Carry `skip_infer_table_types` through to `infer_table_structure` in the
partition flow. Now, for PPT/X, DOC/X, etc., Table elements do not have a
`text_as_html` field when their file type is listed in `skip_infer_table_types`.
Note: I've continued to exclude this variable from partitioners that go
through the HTML flow. I think that if we already have the HTML, it doesn't
make sense to carry the infer variable along, since we're not inferring the
HTML table in these cases.
TODO:
✅ add unit tests
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
This commit is contained in:
parent
6707cab250
commit
0584e1d031
@ -1,4 +1,4 @@
|
|||||||
## 0.10.26-dev1
|
## 0.10.26-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -10,6 +10,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* **Fix a bug in Table partitioning** Previously, the `skip_infer_table_types` variable used in `partition` was not being passed down to the file-specific partitioners. Now you can use the `skip_infer_table_types` list in `partition` to name the file types whose Table elements should omit the `text_as_html` metadata field, or set the `infer_table_structure` boolean on the file-specific partitioning function.
|
||||||
|
|
||||||
## 0.10.25
|
## 0.10.25
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -460,7 +460,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
|
|||||||
Table Extraction for other filetypes
|
Table Extraction for other filetypes
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with. By default, we skip table extraction for PDFs Images, and Excel files which are ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction only works with ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
|
We also provide support for enabling and disabling table extraction for file types other than PDF files. Set the ``skip_infer_table_types`` parameter to specify the document types for which you want to skip table extraction. By default, we skip table extraction for PDFs, images, and Excel files, i.e. ``pdf``, ``jpg``, ``png``, ``xlsx``, and ``xls``. Note that table extraction for images and PDFs only works with the ``hi_res`` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
|
||||||
|
|
||||||
.. tabs::
|
.. tabs::
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ def test_it_splits_a_large_section_into_multiple_chunks():
|
|||||||
Title("Introduction"),
|
Title("Introduction"),
|
||||||
Text(
|
Text(
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
|
||||||
" porta volutpat."
|
" porta volutpat.",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -35,6 +35,24 @@ def test_partition_csv_from_filename(filename, expected_text, expected_table):
|
|||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"infer_table_structure",
|
||||||
|
[
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_csv_from_filename_infer_table_structure(infer_table_structure):
|
||||||
|
f_path = "example-docs/stanley-cups.csv"
|
||||||
|
elements = partition_csv(filename=f_path, infer_table_structure=infer_table_structure)
|
||||||
|
|
||||||
|
table_element_has_text_as_html_field = (
|
||||||
|
hasattr(elements[0].metadata, "text_as_html")
|
||||||
|
and elements[0].metadata.text_as_html is not None
|
||||||
|
)
|
||||||
|
assert table_element_has_text_as_html_field == infer_table_structure
|
||||||
|
|
||||||
|
|
||||||
def test_partition_csv_from_filename_with_metadata_filename(
|
def test_partition_csv_from_filename_with_metadata_filename(
|
||||||
filename="example-docs/stanley-cups.csv",
|
filename="example-docs/stanley-cups.csv",
|
||||||
):
|
):
|
||||||
|
@ -74,6 +74,25 @@ def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
|
|||||||
assert element.metadata.filename is None
|
assert element.metadata.filename is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"infer_table_structure",
|
||||||
|
[
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_docx_infer_table_structure(infer_table_structure):
|
||||||
|
elements = partition_docx(
|
||||||
|
filename="example-docs/fake_table.docx",
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
|
)
|
||||||
|
table_element_has_text_as_html_field = (
|
||||||
|
hasattr(elements[0].metadata, "text_as_html")
|
||||||
|
and elements[0].metadata.text_as_html is not None
|
||||||
|
)
|
||||||
|
assert table_element_has_text_as_html_field == infer_table_structure
|
||||||
|
|
||||||
|
|
||||||
def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
|
def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
|
||||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
mock_document.save(filename)
|
mock_document.save(filename)
|
||||||
@ -265,6 +284,7 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: List[Dic
|
|||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
False,
|
False,
|
||||||
|
True,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
paragraph = partitioner._document.paragraphs[1]
|
paragraph = partitioner._document.paragraphs[1]
|
||||||
@ -289,6 +309,7 @@ def test_iter_table_emphasis(expected_emphasized_texts: List[Dict[str, str]]):
|
|||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
False,
|
False,
|
||||||
|
True,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
table = partitioner._document.tables[0]
|
table = partitioner._document.tables[0]
|
||||||
@ -305,6 +326,7 @@ def test_table_emphasis(
|
|||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
False,
|
False,
|
||||||
|
True,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
table = partitioner._document.tables[0]
|
table = partitioner._document.tables[0]
|
||||||
@ -350,7 +372,14 @@ def test_partition_docx_with_json(mock_document, tmpdir):
|
|||||||
|
|
||||||
|
|
||||||
def test_parse_category_depth_by_style():
|
def test_parse_category_depth_by_style():
|
||||||
partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
|
partitioner = _DocxPartitioner(
|
||||||
|
"example-docs/category-level.docx",
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
False,
|
||||||
|
True,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
|
||||||
# Category depths are 0-indexed and relative to the category type
|
# Category depths are 0-indexed and relative to the category type
|
||||||
# Title, list item, bullet, narrative text, etc.
|
# Title, list item, bullet, narrative text, etc.
|
||||||
@ -381,7 +410,7 @@ def test_parse_category_depth_by_style():
|
|||||||
|
|
||||||
|
|
||||||
def test_parse_category_depth_by_style_name():
|
def test_parse_category_depth_by_style_name():
|
||||||
partitioner = _DocxPartitioner(None, None, None, False, None)
|
partitioner = _DocxPartitioner(None, None, None, False, True, None)
|
||||||
|
|
||||||
test_cases = [
|
test_cases = [
|
||||||
(0, "Heading 1"),
|
(0, "Heading 1"),
|
||||||
@ -406,7 +435,7 @@ def test_parse_category_depth_by_style_name():
|
|||||||
|
|
||||||
|
|
||||||
def test_parse_category_depth_by_style_ilvl():
|
def test_parse_category_depth_by_style_ilvl():
|
||||||
partitioner = _DocxPartitioner(None, None, None, False, None)
|
partitioner = _DocxPartitioner(None, None, None, False, True, None)
|
||||||
assert partitioner._parse_category_depth_by_style_ilvl() == 0
|
assert partitioner._parse_category_depth_by_style_ilvl() == 0
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
from unstructured.documents.elements import Table, TableChunk, Title
|
from unstructured.documents.elements import Table, TableChunk, Title
|
||||||
@ -54,6 +56,24 @@ def test_partition_odt_from_file():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"infer_table_structure",
|
||||||
|
[
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_odt_infer_table_structure(infer_table_structure):
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
elements = partition_odt(file=f, infer_table_structure=infer_table_structure)
|
||||||
|
table_element_has_text_as_html_field = (
|
||||||
|
hasattr(elements[1].metadata, "text_as_html")
|
||||||
|
and elements[1].metadata.text_as_html is not None
|
||||||
|
)
|
||||||
|
assert table_element_has_text_as_html_field == infer_table_structure
|
||||||
|
|
||||||
|
|
||||||
def test_partition_odt_from_file_with_metadata_filename():
|
def test_partition_odt_from_file_with_metadata_filename():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
|
@ -262,6 +262,26 @@ def test_partition_pptx_grabs_tables():
|
|||||||
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
|
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"infer_table_structure",
|
||||||
|
[
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_pptx_infer_table_structure(infer_table_structure):
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
|
||||||
|
elements = cast(
|
||||||
|
Sequence[Text],
|
||||||
|
partition_pptx(filename=filename, infer_table_structure=infer_table_structure),
|
||||||
|
)
|
||||||
|
table_element_has_text_as_html_field = (
|
||||||
|
hasattr(elements[1].metadata, "text_as_html")
|
||||||
|
and elements[1].metadata.text_as_html is not None
|
||||||
|
)
|
||||||
|
assert table_element_has_text_as_html_field == infer_table_structure
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pptx_malformed():
|
def test_partition_pptx_malformed():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||||
elements = cast(Sequence[Text], partition_pptx(filename=filename))
|
elements = cast(Sequence[Text], partition_pptx(filename=filename))
|
||||||
|
@ -713,7 +713,7 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh
|
|||||||
|
|
||||||
|
|
||||||
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
|
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
|
||||||
elements = partition(filename=filename, include_header=False)
|
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
|
||||||
|
|
||||||
assert sum(isinstance(element, Table) for element in elements) == 2
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
||||||
assert sum(isinstance(element, Title) for element in elements) == 2
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
||||||
@ -726,9 +726,36 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
|
|||||||
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("skip_infer_table_types", "filename", "has_text_as_html_field"),
|
||||||
|
[
|
||||||
|
(["xlsx"], "stanley-cups.xlsx", False),
|
||||||
|
([], "stanley-cups.xlsx", True),
|
||||||
|
(["odt"], "fake.odt", False),
|
||||||
|
([], "fake.odt", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_auto_partition_respects_skip_infer_table_types(
|
||||||
|
skip_infer_table_types, filename, has_text_as_html_field
|
||||||
|
):
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
table_elements = [
|
||||||
|
e
|
||||||
|
for e in partition(file=f, skip_infer_table_types=skip_infer_table_types)
|
||||||
|
if isinstance(e, Table)
|
||||||
|
]
|
||||||
|
for table_element in table_elements:
|
||||||
|
table_element_has_text_as_html_field = (
|
||||||
|
hasattr(table_element.metadata, "text_as_html")
|
||||||
|
and table_element.metadata.text_as_html is not None
|
||||||
|
)
|
||||||
|
assert table_element_has_text_as_html_field == has_text_as_html_field
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
|
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition(file=f, include_header=False)
|
elements = partition(file=f, include_header=False, skip_infer_table_types=[])
|
||||||
|
|
||||||
assert sum(isinstance(element, Table) for element in elements) == 2
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
||||||
assert sum(isinstance(element, Title) for element in elements) == 2
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
||||||
@ -834,7 +861,7 @@ EXPECTED_XLS_TABLE = (
|
|||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||||
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
|
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
|
||||||
elements = partition(filename=filename, include_header=False)
|
elements = partition(filename=filename, include_header=False, skip_infer_table_types=[])
|
||||||
|
|
||||||
assert sum(isinstance(element, Table) for element in elements) == 2
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
||||||
assert len(elements) == 18
|
assert len(elements) == 18
|
||||||
|
@ -51,6 +51,27 @@ def test_partition_xlsx_from_filename_with_metadata_filename(
|
|||||||
assert elements[0].metadata.filename == "test"
|
assert elements[0].metadata.filename == "test"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"infer_table_structure",
|
||||||
|
[
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_xlsx_infer_table_structure(
|
||||||
|
infer_table_structure,
|
||||||
|
filename="example-docs/stanley-cups.xlsx",
|
||||||
|
):
|
||||||
|
elements = partition_xlsx(filename=filename, infer_table_structure=infer_table_structure)
|
||||||
|
table_elements = [e for e in elements if isinstance(e, Table)]
|
||||||
|
for table_element in table_elements:
|
||||||
|
table_element_has_text_as_html_field = (
|
||||||
|
hasattr(table_element.metadata, "text_as_html")
|
||||||
|
and table_element.metadata.text_as_html is not None
|
||||||
|
)
|
||||||
|
assert table_element_has_text_as_html_field == infer_table_structure
|
||||||
|
|
||||||
|
|
||||||
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
|
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
|
||||||
elements = partition_xlsx(filename=filename, include_header=True)
|
elements = partition_xlsx(filename=filename, include_header=True)
|
||||||
assert sum(isinstance(element, Table) for element in elements) == 2
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
||||||
|
@ -107,8 +107,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"page_name": "Stanley Cups",
|
"page_name": "Stanley Cups"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "Stanley Cups"
|
"text": "Stanley Cups"
|
||||||
},
|
},
|
||||||
@ -220,8 +219,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"page_name": "Stanley Cups",
|
"page_name": "Stanley Cups"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
|
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
|
||||||
},
|
},
|
||||||
@ -333,8 +331,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 2,
|
"page_number": 2,
|
||||||
"page_name": "Stanley Cups Since 67",
|
"page_name": "Stanley Cups Since 67"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "Stanley Cups Since 67"
|
"text": "Stanley Cups Since 67"
|
||||||
},
|
},
|
||||||
@ -446,8 +443,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 2,
|
"page_number": 2,
|
||||||
"page_name": "Stanley Cups Since 67",
|
"page_name": "Stanley Cups Since 67"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
|
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
|
||||||
}
|
}
|
||||||
|
@ -18,8 +18,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"page_name": "Stanley Cups",
|
"page_name": "Stanley Cups"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "Stanley Cups"
|
"text": "Stanley Cups"
|
||||||
},
|
},
|
||||||
@ -42,8 +41,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"page_name": "Stanley Cups",
|
"page_name": "Stanley Cups"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
|
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
|
||||||
},
|
},
|
||||||
@ -66,8 +64,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 2,
|
"page_number": 2,
|
||||||
"page_name": "Stanley Cups Since 67",
|
"page_name": "Stanley Cups Since 67"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "Stanley Cups Since 67"
|
"text": "Stanley Cups Since 67"
|
||||||
},
|
},
|
||||||
@ -90,8 +87,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 2,
|
"page_number": 2,
|
||||||
"page_name": "Stanley Cups Since 67",
|
"page_name": "Stanley Cups Since 67"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
|
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
|
||||||
}
|
}
|
||||||
|
@ -18,8 +18,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 1,
|
"page_number": 1,
|
||||||
"page_name": "Example Test",
|
"page_name": "Example Test"
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MC</td>\n <td>What is 2+2?</td>\n <td>4</td>\n <td>correct</td>\n <td>3</td>\n <td>incorrect</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
|
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
|
||||||
},
|
},
|
||||||
@ -88,8 +87,7 @@
|
|||||||
"eng"
|
"eng"
|
||||||
],
|
],
|
||||||
"page_number": 2,
|
"page_number": 2,
|
||||||
"page_name": "Format Abbr.",
|
"page_name": "Format Abbr."
|
||||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
|
|
||||||
},
|
},
|
||||||
"text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
|
"text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
|
||||||
},
|
},
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.10.26-dev1" # pragma: no cover
|
__version__ = "0.10.26-dev2" # pragma: no cover
|
||||||
|
@ -265,6 +265,7 @@ def partition(
|
|||||||
elements = _partition_doc(
|
elements = _partition_doc(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -274,6 +275,7 @@ def partition(
|
|||||||
elements = _partition_docx(
|
elements = _partition_docx(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -283,6 +285,7 @@ def partition(
|
|||||||
elements = _partition_odt(
|
elements = _partition_odt(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -331,6 +334,7 @@ def partition(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -351,6 +355,7 @@ def partition(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -361,6 +366,7 @@ def partition(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -406,6 +412,7 @@ def partition(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -416,6 +423,7 @@ def partition(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -426,6 +434,7 @@ def partition(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -442,6 +451,7 @@ def partition(
|
|||||||
elements = _partition_xlsx(
|
elements = _partition_xlsx(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -451,6 +461,7 @@ def partition(
|
|||||||
elements = _partition_csv(
|
elements = _partition_csv(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
|
@ -32,6 +32,7 @@ def partition_csv(
|
|||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
# NOTE (jennings) partition_csv generates a single TableElement
|
# NOTE (jennings) partition_csv generates a single TableElement
|
||||||
# so detect_language_per_element is not included as a param
|
# so detect_language_per_element is not included as a param
|
||||||
@ -51,6 +52,12 @@ def partition_csv(
|
|||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
include_metadata
|
include_metadata
|
||||||
Determines whether or not metadata is included in the output.
|
Determines whether or not metadata is included in the output.
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
languages
|
languages
|
||||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||||
@ -74,11 +81,12 @@ def partition_csv(
|
|||||||
|
|
||||||
if include_metadata:
|
if include_metadata:
|
||||||
metadata = ElementMetadata(
|
metadata = ElementMetadata(
|
||||||
text_as_html=html_text,
|
|
||||||
filename=metadata_filename or filename,
|
filename=metadata_filename or filename,
|
||||||
last_modified=metadata_last_modified or last_modification_date,
|
last_modified=metadata_last_modified or last_modification_date,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
)
|
)
|
||||||
|
if infer_table_structure:
|
||||||
|
metadata.text_as_html = html_text
|
||||||
else:
|
else:
|
||||||
metadata = ElementMetadata()
|
metadata = ElementMetadata()
|
||||||
|
|
||||||
|
@ -88,6 +88,7 @@ def convert_and_partition_docx(
|
|||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
@ -108,6 +109,12 @@ def convert_and_partition_docx(
|
|||||||
include_metadata
|
include_metadata
|
||||||
Determines whether or not metadata is included in the metadata attribute on the elements in
|
Determines whether or not metadata is included in the metadata attribute on the elements in
|
||||||
the output.
|
the output.
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
languages
|
languages
|
||||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||||
@ -153,6 +160,7 @@ def convert_and_partition_docx(
|
|||||||
filename=docx_path,
|
filename=docx_path,
|
||||||
metadata_filename=metadata_filename,
|
metadata_filename=metadata_filename,
|
||||||
include_metadata=include_metadata,
|
include_metadata=include_metadata,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
@ -170,6 +178,7 @@ def partition_docx(
|
|||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
include_page_breaks: bool = True,
|
include_page_breaks: bool = True,
|
||||||
include_metadata: bool = True, # used by decorator
|
include_metadata: bool = True, # used by decorator
|
||||||
|
infer_table_structure: bool = True,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
chunking_strategy: Optional[str] = None, # used by decorator
|
chunking_strategy: Optional[str] = None, # used by decorator
|
||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
@ -184,6 +193,12 @@ def partition_docx(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
metadata_filename
|
metadata_filename
|
||||||
The filename to use for the metadata. Relevant because partition_doc converts the document
|
The filename to use for the metadata. Relevant because partition_doc converts the document
|
||||||
to .docx before partition. We want the original source filename in the metadata.
|
to .docx before partition. We want the original source filename in the metadata.
|
||||||
@ -205,6 +220,7 @@ def partition_docx(
|
|||||||
file,
|
file,
|
||||||
metadata_filename,
|
metadata_filename,
|
||||||
include_page_breaks,
|
include_page_breaks,
|
||||||
|
infer_table_structure,
|
||||||
metadata_last_modified,
|
metadata_last_modified,
|
||||||
)
|
)
|
||||||
elements = apply_lang_metadata(
|
elements = apply_lang_metadata(
|
||||||
@ -246,12 +262,14 @@ class _DocxPartitioner:
|
|||||||
file: Optional[IO[bytes]],
|
file: Optional[IO[bytes]],
|
||||||
metadata_filename: Optional[str],
|
metadata_filename: Optional[str],
|
||||||
include_page_breaks: bool,
|
include_page_breaks: bool,
|
||||||
|
infer_table_structure: bool,
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
) -> None:
|
) -> None:
|
||||||
self._filename = filename
|
self._filename = filename
|
||||||
self._file = file
|
self._file = file
|
||||||
self._metadata_filename = metadata_filename
|
self._metadata_filename = metadata_filename
|
||||||
self._include_page_breaks = include_page_breaks
|
self._include_page_breaks = include_page_breaks
|
||||||
|
self._infer_table_structure = infer_table_structure
|
||||||
self._metadata_last_modified = metadata_last_modified
|
self._metadata_last_modified = metadata_last_modified
|
||||||
self._page_counter: int = 1
|
self._page_counter: int = 1
|
||||||
|
|
||||||
@ -262,6 +280,7 @@ class _DocxPartitioner:
|
|||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
include_page_breaks: bool = True,
|
include_page_breaks: bool = True,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
) -> Iterator[Element]:
|
) -> Iterator[Element]:
|
||||||
"""Partition MS Word documents (.docx format) into its document elements."""
|
"""Partition MS Word documents (.docx format) into its document elements."""
|
||||||
@ -270,6 +289,7 @@ class _DocxPartitioner:
|
|||||||
file,
|
file,
|
||||||
metadata_filename,
|
metadata_filename,
|
||||||
include_page_breaks,
|
include_page_breaks,
|
||||||
|
infer_table_structure,
|
||||||
metadata_last_modified,
|
metadata_last_modified,
|
||||||
)._iter_document_elements()
|
)._iter_document_elements()
|
||||||
|
|
||||||
@ -536,7 +556,8 @@ class _DocxPartitioner:
|
|||||||
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
|
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
|
||||||
# -- at present, we always generate exactly one Table element, but we might want
|
# -- at present, we always generate exactly one Table element, but we might want
|
||||||
# -- to skip, for example, an empty table, or accommodate nested tables.
|
# -- to skip, for example, an empty table, or accommodate nested tables.
|
||||||
|
html_table = None
|
||||||
|
if self._infer_table_structure:
|
||||||
html_table = convert_ms_office_table_to_text(table, as_html=True)
|
html_table = convert_ms_office_table_to_text(table, as_html=True)
|
||||||
text_table = convert_ms_office_table_to_text(table, as_html=False)
|
text_table = convert_ms_office_table_to_text(table, as_html=False)
|
||||||
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
|
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
|
||||||
|
@ -17,6 +17,7 @@ def partition_odt(
|
|||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[BinaryIO] = None,
|
file: Optional[BinaryIO] = None,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
chunking_strategy: Optional[str] = None,
|
chunking_strategy: Optional[str] = None,
|
||||||
@ -32,6 +33,12 @@ def partition_odt(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
metadata_last_modified
|
metadata_last_modified
|
||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
languages
|
languages
|
||||||
@ -53,6 +60,7 @@ def partition_odt(
|
|||||||
source_format="odt",
|
source_format="odt",
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
metadata_filename=metadata_filename,
|
metadata_filename=metadata_filename,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
|
@ -22,6 +22,7 @@ def partition_ppt(
|
|||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
chunking_strategy: Optional[str] = None,
|
chunking_strategy: Optional[str] = None,
|
||||||
@ -39,6 +40,12 @@ def partition_ppt(
|
|||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
include_page_breaks
|
include_page_breaks
|
||||||
If True, includes a PageBreak element between slides
|
If True, includes a PageBreak element between slides
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
metadata_last_modified
|
metadata_last_modified
|
||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
languages
|
languages
|
||||||
@ -82,6 +89,7 @@ def partition_ppt(
|
|||||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||||
elements = partition_pptx(
|
elements = partition_pptx(
|
||||||
filename=pptx_filename,
|
filename=pptx_filename,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
metadata_filename=metadata_filename,
|
metadata_filename=metadata_filename,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
|
@ -56,6 +56,7 @@ def partition_pptx(
|
|||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
include_slide_notes: bool = False,
|
include_slide_notes: bool = False,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
chunking_strategy: Optional[str] = None,
|
chunking_strategy: Optional[str] = None,
|
||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
@ -79,6 +80,12 @@ def partition_pptx(
|
|||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
include_slide_notes
|
include_slide_notes
|
||||||
If True, includes the slide notes as element
|
If True, includes the slide notes as element
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
languages
|
languages
|
||||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||||
@ -104,6 +111,7 @@ def partition_pptx(
|
|||||||
source_file,
|
source_file,
|
||||||
include_page_breaks,
|
include_page_breaks,
|
||||||
include_slide_notes,
|
include_slide_notes,
|
||||||
|
infer_table_structure,
|
||||||
metadata_filename,
|
metadata_filename,
|
||||||
metadata_last_modified,
|
metadata_last_modified,
|
||||||
)
|
)
|
||||||
@ -126,12 +134,14 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
|||||||
# -- this object in tests and makes them less sensitive to signature changes.
|
# -- this object in tests and makes them less sensitive to signature changes.
|
||||||
include_page_breaks: bool = True,
|
include_page_breaks: bool = True,
|
||||||
include_slide_notes: bool = False,
|
include_slide_notes: bool = False,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._file = file
|
self._file = file
|
||||||
self._include_page_breaks = include_page_breaks
|
self._include_page_breaks = include_page_breaks
|
||||||
self._include_slide_notes = include_slide_notes
|
self._include_slide_notes = include_slide_notes
|
||||||
|
self._infer_table_structure = infer_table_structure
|
||||||
self._metadata_filename = metadata_filename
|
self._metadata_filename = metadata_filename
|
||||||
self._metadata_last_modified = metadata_last_modified
|
self._metadata_last_modified = metadata_last_modified
|
||||||
self._page_counter = 0
|
self._page_counter = 0
|
||||||
@ -142,6 +152,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
|||||||
file: Union[str, IO[bytes]],
|
file: Union[str, IO[bytes]],
|
||||||
include_page_breaks: bool,
|
include_page_breaks: bool,
|
||||||
include_slide_notes: bool,
|
include_slide_notes: bool,
|
||||||
|
infer_table_structure: bool,
|
||||||
metadata_filename: Optional[str],
|
metadata_filename: Optional[str],
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
) -> Iterator[Element]:
|
) -> Iterator[Element]:
|
||||||
@ -150,6 +161,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
|||||||
file,
|
file,
|
||||||
include_page_breaks,
|
include_page_breaks,
|
||||||
include_slide_notes,
|
include_slide_notes,
|
||||||
|
infer_table_structure,
|
||||||
metadata_filename,
|
metadata_filename,
|
||||||
metadata_last_modified,
|
metadata_last_modified,
|
||||||
)._iter_presentation_elements()
|
)._iter_presentation_elements()
|
||||||
@ -319,6 +331,8 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
|||||||
text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
|
text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
|
||||||
if not text_table:
|
if not text_table:
|
||||||
return
|
return
|
||||||
|
html_table = None
|
||||||
|
if self._infer_table_structure:
|
||||||
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
|
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
|
||||||
yield Table(
|
yield Table(
|
||||||
text=text_table,
|
text=text_table,
|
||||||
@ -348,7 +362,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
|||||||
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
|
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
|
||||||
"""Orders the shapes on `slide` from top to bottom and left to right.
|
"""Orders the shapes on `slide` from top to bottom and left to right.
|
||||||
|
|
||||||
Returns the the title shape if it exists and the ordered shapes."""
|
Returns the title shape if it exists and the ordered shapes."""
|
||||||
|
|
||||||
def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
|
def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
|
||||||
for shape in shapes:
|
for shape in shapes:
|
||||||
|
@ -44,6 +44,7 @@ def partition_xlsx(
|
|||||||
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
|
infer_table_structure: bool = True,
|
||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
@ -61,6 +62,12 @@ def partition_xlsx(
|
|||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
include_metadata
|
include_metadata
|
||||||
Determines whether or not metadata is included in the output.
|
Determines whether or not metadata is included in the output.
|
||||||
|
infer_table_structure
|
||||||
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
|
I.e., rows and cells are preserved.
|
||||||
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
languages
|
languages
|
||||||
User defined value for metadata.languages if provided. Otherwise language is detected
|
User defined value for metadata.languages if provided. Otherwise language is detected
|
||||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||||
@ -71,7 +78,7 @@ def partition_xlsx(
|
|||||||
metadata_last_modified
|
metadata_last_modified
|
||||||
The day of the last modification
|
The day of the last modification
|
||||||
include_header
|
include_header
|
||||||
Determines whether or not header info info is included in text and medatada.text_as_html
|
Determines whether or not header info is included in text and metadata.text_as_html
|
||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
|
|
||||||
@ -94,7 +101,11 @@ def partition_xlsx(
|
|||||||
for sheet_name, sheet in sheets.items():
|
for sheet_name, sheet in sheets.items():
|
||||||
page_number += 1
|
page_number += 1
|
||||||
if not find_subtable:
|
if not find_subtable:
|
||||||
html_text = sheet.to_html(index=False, header=include_header, na_rep="")
|
html_text = (
|
||||||
|
sheet.to_html(index=False, header=include_header, na_rep="")
|
||||||
|
if infer_table_structure
|
||||||
|
else None
|
||||||
|
)
|
||||||
text = soupparser_fromstring(html_text).text_content()
|
text = soupparser_fromstring(html_text).text_content()
|
||||||
|
|
||||||
if include_metadata:
|
if include_metadata:
|
||||||
@ -158,7 +169,7 @@ def partition_xlsx(
|
|||||||
text = soupparser_fromstring(html_text).text_content()
|
text = soupparser_fromstring(html_text).text_content()
|
||||||
subtable = Table(text=text)
|
subtable = Table(text=text)
|
||||||
subtable.metadata = metadata
|
subtable.metadata = metadata
|
||||||
subtable.metadata.text_as_html = html_text
|
subtable.metadata.text_as_html = html_text if infer_table_structure else None
|
||||||
elements.append(subtable)
|
elements.append(subtable)
|
||||||
|
|
||||||
if front_non_consecutive is not None and last_non_consecutive is not None:
|
if front_non_consecutive is not None and last_non_consecutive is not None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user