enhancement: Add include_header kwarg for xlsx, default True (#1125)

Closes GitHub issue #1121

Adds an include_header kwarg to partition_xlsx and changes the default behavior to include the header (include_header=True).
This commit is contained in:
John 2023-08-16 23:16:23 -05:00 committed by GitHub
parent 22c12ef806
commit 9f7bd6127b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 67 additions and 39 deletions

View File

@ -1,16 +1,15 @@
## 0.10.1-dev3
## 0.10.1
### Enhancements
* Bump unstructured-inference==0.5.12:
- fix to avoid trace for certain PDF's
* Bump unstructured-inference==0.5.11:
- better defaults for DPI for hi_res and Chipper
* Bump unstructured-inference==0.5.10:
- implement full-page OCR
- fix to avoid trace for certain PDF's (0.5.12)
- better defaults for DPI for hi_res and Chipper (0.5.11)
- implement full-page OCR (0.5.10)
### Features
### Fixes
* Fix dead links in repository README (Quick Start > Install for local development, and Learn more > Batch Processing)
* Update document dependencies to include tesseract-lang for additional language support (required for tests to pass)
@ -18,6 +17,7 @@
### Enhancements
* Add `include_header` kwarg to `partition_xlsx` and change default behavior to `True`
* Update the `links` and `emphasized_texts` metadata fields
### Features
@ -26,6 +26,7 @@
* fix pdf partition of list items being detected as titles in OCR only mode
## 0.9.3
### Enhancements

View File

@ -668,7 +668,7 @@ EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsh
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition(filename=filename)
elements = partition(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
@ -681,7 +681,7 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition(file=f)
elements = partition(file=f, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
@ -774,7 +774,7 @@ EXPECTED_XLS_TABLE = (
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
elements = partition(filename=filename)
elements = partition(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 3

View File

@ -9,7 +9,7 @@ EXCEPTED_PAGE_NAME = "Stanley Cups"
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename)
elements = partition_xlsx(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
@ -23,7 +23,7 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx")
def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"):
elements = partition_xlsx(filename=filename)
elements = partition_xlsx(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 1
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
@ -32,16 +32,27 @@ def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xl
def test_partition_xlsx_from_filename_with_metadata_filename(
filename="example-docs/stanley-cups.xlsx",
):
elements = partition_xlsx(filename=filename, metadata_filename="test")
elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_header=True)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert (
clean_extra_whitespace(elements[0].text)
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
)
assert "<thead>" in elements[0].metadata.text_as_html
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f)
elements = partition_xlsx(file=f, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
@ -55,15 +66,28 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, metadata_filename="test")
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"
def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, include_header=True)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
assert (
clean_extra_whitespace(elements[0].text)
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
)
assert "<thead>" in elements[0].metadata.text_as_html
def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_metadata=False)
elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
@ -78,7 +102,7 @@ def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley
def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, include_metadata=False)
elements = partition_xlsx(file=f, include_metadata=False, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2

View File

@ -1,7 +1,7 @@
[
{
"type": "Table",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
"metadata": {
"data_source": {
"record_locator": {
@ -16,13 +16,13 @@
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"element_id": "0699dddf33814117e04654068f5182f6",
"metadata": {
"data_source": {
"record_locator": {
@ -37,8 +37,8 @@
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]

View File

@ -1,26 +1,26 @@
[
{
"type": "Table",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"element_id": "0699dddf33814117e04654068f5182f6",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]

View File

@ -1,41 +1,41 @@
[
{
"type": "Table",
"element_id": "677f7fdbfa79de9d91e157663dd559cd",
"element_id": "0e2d044a26942328e2b8647574232e7f",
"metadata": {
"data_source": {},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"page_number": 1,
"page_name": "Example Test",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>MC</th>\n <th>What is 2+2?</th>\n <th>4</th>\n <th>correct</th>\n <th>3</th>\n <th>incorrect</th>\n <th>Unnamed: 6</th>\n <th>Unnamed: 7</th>\n <th>Unnamed: 8</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
"text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\nUnnamed: 6\nUnnamed: 7\nUnnamed: 8\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
},
{
"type": "Table",
"element_id": "079ef3ee8c03cb36789b08765181ebc4",
"element_id": "5c56dd4c5b649b873ebd848312e66753",
"metadata": {
"data_source": {},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"page_number": 2,
"page_name": "Format Abbr.",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Question Format Abbreviations</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Unnamed: 0</th>\n <th>Unnamed: 1</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Question Format Abbreviations</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
"text": "\n\n\nUnnamed: 0\nUnnamed: 1\n\n\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
},
{
"type": "Table",
"element_id": "c7b7d8780a970d589554c3784283b67e",
"element_id": "f48657c4eb70d98975e567248d0ef4bb",
"metadata": {
"data_source": {},
"filename": "tests-example.xls",
"filetype": "application/vnd.ms-excel",
"page_number": 3,
"page_name": "Readme",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>File Information</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Source</td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Version</td>\n </tr>\n <tr>\n <td>1.0 (January 2012)</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Contact</td>\n </tr>\n <tr>\n <td>bb-help@andrew.cmu.edu</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>About</td>\n </tr>\n <tr>\n <td>This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Unnamed: 0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>File Information</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Source</td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Version</td>\n </tr>\n <tr>\n <td>1.0 (January 2012)</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Contact</td>\n </tr>\n <tr>\n <td>bb-help@andrew.cmu.edu</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>About</td>\n </tr>\n <tr>\n <td>This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n"
"text": "\n\n\nUnnamed: 0\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n"
}
]

View File

@ -1 +1 @@
__version__ = "0.10.1-dev3" # pragma: no cover
__version__ = "0.10.1" # pragma: no cover

View File

@ -27,6 +27,7 @@ def partition_xlsx(
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
metadata_last_modified: Optional[str] = None,
include_header: bool = True,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
@ -41,6 +42,8 @@ def partition_xlsx(
Determines whether or not metadata is included in the output.
metadata_last_modified
The day of the last modification
include_header
Determines whether or not header info is included in text and metadata.text_as_html
"""
exactly_one(filename=filename, file=file)
last_modification_date = None
@ -59,7 +62,7 @@ def partition_xlsx(
page_number = 0
for sheet_name, table in sheets.items():
page_number += 1
html_text = table.to_html(index=False, header=False, na_rep="")
html_text = table.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
if include_metadata: