feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
import pytest
|
|
|
|
|
|
|
|
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT, EXPECTED_TITLE
|
2023-05-16 15:40:40 -04:00
|
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
from unstructured.documents.elements import Table, Text, Title
|
2023-08-29 16:59:26 -04:00
|
|
|
from unstructured.partition.json import partition_json
|
2023-05-16 15:40:40 -04:00
|
|
|
from unstructured.partition.xlsx import partition_xlsx
|
2023-08-29 16:59:26 -04:00
|
|
|
from unstructured.staging.base import elements_to_json
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
# MIME type string for .xlsx (Office Open XML spreadsheet) documents.
# Presumably compared against each element's `metadata.filetype` in the tests
# below -- confirm at the usage sites, which are outside this view.
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
|
|
|
2023-05-18 16:53:23 +03:00
|
|
|
# Sheet name used as the expected value for the stanley-cups.xlsx fixture;
# presumably checked against `metadata.page_name` -- confirm at usage sites.
# NOTE(review): the identifier looks like a typo of EXPECTED_PAGE_NAME. It is
# left unchanged here because its usages are outside this view; rename it
# together with every reference in a single change.
EXCEPTED_PAGE_NAME = "Stanley Cups"
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, include_header=False)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
|
|
|
|
assert elements[1].metadata.text_as_html == EXPECTED_TABLE
|
|
|
|
assert elements[1].metadata.page_number == 1
|
|
|
|
assert elements[1].metadata.filetype == EXPECTED_FILETYPE
|
|
|
|
assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
|
|
|
|
assert elements[1].metadata.filename == "stanley-cups.xlsx"
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
2023-08-04 16:14:08 +02:00
|
|
|
def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, include_header=False)
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Text) for element in elements) == 1
|
2023-08-13 12:20:33 -07:00
|
|
|
assert len(elements) == 1
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
|
2023-08-04 16:14:08 +02:00
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_xlsx_from_filename_with_metadata_filename(
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False)
|
2023-07-05 15:02:22 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
|
2023-07-05 15:02:22 -05:00
|
|
|
assert elements[0].metadata.filename == "test"
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
2023-08-16 23:16:23 -05:00
|
|
|
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
elements = partition_xlsx(filename=filename, include_header=True)
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
2023-08-16 23:16:23 -05:00
|
|
|
assert len(elements) == 2
|
|
|
|
assert (
|
|
|
|
clean_extra_whitespace(elements[0].text)
|
|
|
|
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
|
|
|
|
)
|
|
|
|
assert "<thead>" in elements[0].metadata.text_as_html
|
|
|
|
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(file=f, include_header=False)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
|
|
|
|
assert elements[1].metadata.text_as_html == EXPECTED_TABLE
|
|
|
|
assert elements[1].metadata.page_number == 1
|
|
|
|
assert elements[1].metadata.filetype == EXPECTED_FILETYPE
|
|
|
|
assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
|
|
|
|
assert elements[1].metadata.filename is None
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
|
2023-07-05 15:02:22 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
|
|
|
|
assert elements[1].metadata.filename == "test"
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
2023-08-16 23:16:23 -05:00
|
|
|
def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
elements = partition_xlsx(file=f, include_header=True)
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
2023-08-16 23:16:23 -05:00
|
|
|
assert len(elements) == 2
|
|
|
|
assert (
|
|
|
|
clean_extra_whitespace(elements[0].text)
|
|
|
|
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
|
|
|
|
)
|
|
|
|
assert "<thead>" in elements[0].metadata.text_as_html
|
|
|
|
|
|
|
|
|
2023-06-30 09:44:46 -05:00
|
|
|
def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Connected elements may include a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Connected elements may include a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
|
|
|
|
assert elements[1].metadata.text_as_html is None
|
|
|
|
assert elements[1].metadata.page_number is None
|
|
|
|
assert elements[1].metadata.filetype is None
|
|
|
|
assert elements[1].metadata.page_name is None
|
|
|
|
assert elements[1].metadata.filename is None
|
2023-06-30 09:44:46 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(file=f, include_metadata=False, include_header=False)
|
2023-06-30 09:44:46 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Connected elements may include a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
2023-06-30 09:44:46 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Connected elements may include a table title, header, or
footer. We count the first non-single empty rows from the top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT
|
2023-06-30 09:44:46 -05:00
|
|
|
assert elements[0].metadata.text_as_html is None
|
|
|
|
assert elements[0].metadata.page_number is None
|
|
|
|
assert elements[0].metadata.filetype is None
|
|
|
|
assert elements[0].metadata.page_name is None
|
2023-07-05 15:02:22 -05:00
|
|
|
assert elements[0].metadata.filename is None
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_metadata_date(
    mocker,
    filename="example-docs/stanley-cups.xlsx",
):
    """Partitioning by filename reports the date returned by the patched lookup."""
    patched_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.xlsx.get_last_modified_date"
    mocker.patch(patch_target, return_value=patched_date)

    first_element = partition_xlsx(filename=filename)[0]

    assert first_element.metadata.last_modified == patched_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_with_custom_metadata_date(
    mocker,
    filename="example-docs/stanley-cups.xlsx",
):
    """An explicit ``metadata_last_modified`` wins over the patched date lookup."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    patch_target = "unstructured.partition.xlsx.get_last_modified_date"
    mocker.patch(patch_target, return_value=patched_date)

    first_element = partition_xlsx(
        filename=filename,
        metadata_last_modified=custom_date,
    )[0]

    assert first_element.metadata.last_modified == custom_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_metadata_date(
    mocker,
    filename="example-docs/stanley-cups.xlsx",
):
    """Partitioning from a file object reports the date returned by the patched lookup."""
    patched_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.xlsx.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=patched_date)

    with open(filename, "rb") as workbook:
        elements = partition_xlsx(file=workbook)

    first_element = elements[0]
    assert first_element.metadata.last_modified == patched_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_with_custom_metadata_date(
    mocker,
    filename="example-docs/stanley-cups.xlsx",
):
    """An explicit ``metadata_last_modified`` wins over the patched file-date lookup."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2020-07-05T09:24:28"
    patch_target = "unstructured.partition.xlsx.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=patched_date)

    with open(filename, "rb") as workbook:
        elements = partition_xlsx(
            file=workbook,
            metadata_last_modified=custom_date,
        )

    first_element = elements[0]
    assert first_element.metadata.last_modified == custom_date
|
2023-08-29 16:59:26 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_with_json(filename="example-docs/stanley-cups.xlsx"):
    """Elements survive a round trip through ``elements_to_json`` / ``partition_json``.

    The workbook is partitioned with ``include_header=False``, serialized to
    JSON, and parsed back; the restored elements must match the originals.
    """
    elements = partition_xlsx(filename=filename, include_header=False)
    test_elements = partition_json(text=elements_to_json(elements))

    # Equal lengths are asserted first so the pairwise zip below cannot
    # silently skip trailing elements.
    assert len(elements) == len(test_elements)
    assert clean_extra_whitespace(elements[0].text) == clean_extra_whitespace(test_elements[0].text)
    assert elements[0].metadata.text_as_html == test_elements[0].metadata.text_as_html
    assert elements[0].metadata.page_number == test_elements[0].metadata.page_number
    assert elements[0].metadata.page_name == test_elements[0].metadata.page_name
    assert elements[0].metadata.filename == test_elements[0].metadata.filename

    # Iterate the pairs directly instead of indexing with range(len(...)).
    for element, test_element in zip(elements, test_elements):
        assert element == test_element
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skip("Needs to fix language detection for table. Currently detected as 'tur'")
def test_partition_xlsx_metadata_language_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """Expect two tables among four elements, with the first element tagged ``eng``."""
    elements = partition_xlsx(filename=filename, include_header=False)

    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    assert len(elements) == 4

    first_element = elements[0]
    assert first_element.metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"):
|
|
|
|
elements = partition_xlsx(filename)
|
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 3
|
|
|
|
assert len(elements) == 6
|