2023-05-16 15:40:40 -04:00
|
|
|
from tempfile import SpooledTemporaryFile
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- Within connected elements, it is possible to have a table title, header, or
footer. We count the single-cell rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
from typing import IO, Any, BinaryIO, Dict, List, Optional, Tuple, Union, cast
|
2023-05-16 15:40:40 -04:00
|
|
|
|
2023-10-14 14:38:21 -05:00
|
|
|
import networkx as nx
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- Within connected elements, it is possible to have a table title, header, or
footer. We count the single-cell rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
import numpy as np
|
2023-05-16 15:40:40 -04:00
|
|
|
import pandas as pd
|
2023-08-13 12:20:33 -07:00
|
|
|
from lxml.html.soupparser import fromstring as soupparser_fromstring
|
2023-05-16 15:40:40 -04:00
|
|
|
|
2023-10-03 09:40:34 -07:00
|
|
|
from unstructured.chunking.title import add_chunking_strategy
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine the element type as text
- Within connected elements, it is possible to have a table title, header, or
footer. We count the single-cell rows from the top
and bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
from unstructured.cleaners.core import clean_bullets
|
2023-06-16 10:10:56 -04:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
|
Element,
|
|
|
|
|
ElementMetadata,
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Within a connected component, it is possible to have a table title, header, or
footer. We count the consecutive single-cell (otherwise empty) rows from the top
and the bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
ListItem,
|
|
|
|
|
NarrativeText,
|
2023-06-16 10:10:56 -04:00
|
|
|
Table,
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Within a connected component, it is possible to have a table title, header, or
footer. We count the consecutive single-cell (otherwise empty) rows from the top
and the bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
Text,
|
|
|
|
|
Title,
|
2023-06-16 10:10:56 -04:00
|
|
|
process_metadata,
|
|
|
|
|
)
|
2023-05-16 15:40:40 -04:00
|
|
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
2023-07-26 15:10:14 -04:00
|
|
|
from unstructured.partition.common import (
|
|
|
|
|
exactly_one,
|
|
|
|
|
get_last_modified_date,
|
|
|
|
|
get_last_modified_date_from_file,
|
|
|
|
|
spooled_to_bytes_io_if_needed,
|
|
|
|
|
)
|
2023-10-10 20:47:56 -05:00
|
|
|
from unstructured.partition.lang import apply_lang_metadata
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- Within a connected component, it is possible to have a table title, header, or
footer. We count the consecutive single-cell (otherwise empty) rows from the top
and the bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
from unstructured.partition.text_type import (
|
|
|
|
|
is_bulleted_text,
|
|
|
|
|
is_possible_narrative_text,
|
|
|
|
|
is_possible_numbered_list,
|
|
|
|
|
is_possible_title,
|
|
|
|
|
)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
2023-10-05 15:26:47 -05:00
|
|
|
# Module-level label with the value "xlsx"; presumably tags elements produced by
# this partitioner with their detection origin.
# NOTE(review): the consumer of this constant is not visible in this chunk — confirm usage.
DETECTION_ORIGIN: str = "xlsx"
|
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
|
2023-06-16 10:10:56 -04:00
|
|
|
@process_metadata()
|
2023-05-16 15:40:40 -04:00
|
|
|
@add_metadata_with_filetype(FileType.XLSX)
|
2023-10-03 09:40:34 -07:00
|
|
|
@add_chunking_strategy()
|
2023-05-16 15:40:40 -04:00
|
|
|
def partition_xlsx(
|
|
|
|
|
filename: Optional[str] = None,
|
2023-07-05 22:37:31 +02:00
|
|
|
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
2023-05-16 15:40:40 -04:00
|
|
|
metadata_filename: Optional[str] = None,
|
|
|
|
|
include_metadata: bool = True,
|
2023-10-10 20:47:56 -05:00
|
|
|
languages: Optional[List[str]] = ["auto"],
|
|
|
|
|
detect_language_per_element: bool = False,
|
2023-07-31 19:55:43 -07:00
|
|
|
metadata_last_modified: Optional[str] = None,
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
include_header: bool = False,
|
|
|
|
|
find_subtable: bool = True,
|
2023-06-16 10:10:56 -04:00
|
|
|
**kwargs,
|
2023-05-16 15:40:40 -04:00
|
|
|
) -> List[Element]:
|
|
|
|
|
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
|
|
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
|
----------
|
|
|
|
|
filename
|
|
|
|
|
A string defining the target filename path.
|
|
|
|
|
file
|
|
|
|
|
A file-like object using "rb" mode --> open(filename, "rb").
|
|
|
|
|
include_metadata
|
|
|
|
|
Determines whether or not metadata is included in the output.
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
languages
|
2023-10-10 20:47:56 -05:00
|
|
|
User defined value for metadata.languages if provided. Otherwise language is detected
|
|
|
|
|
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
|
|
|
|
in either language.
|
|
|
|
|
Additional Parameters:
|
|
|
|
|
detect_language_per_element
|
|
|
|
|
Detect language per element instead of at the document level.
|
2023-07-31 19:55:43 -07:00
|
|
|
metadata_last_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
The day of the last modification
|
2023-08-16 23:16:23 -05:00
|
|
|
include_header
|
|
|
|
|
Determines whether or not header info is included in text and metadata.text_as_html
|
2023-05-16 15:40:40 -04:00
|
|
|
"""
|
|
|
|
|
exactly_one(filename=filename, file=file)
|
2023-10-10 20:47:56 -05:00
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
last_modification_date = None
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
header = 0 if include_header else None
|
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
if filename:
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
sheets = pd.read_excel(filename, sheet_name=None, header=header)
|
2023-07-26 15:10:14 -04:00
|
|
|
last_modification_date = get_last_modified_date(filename)
|
|
|
|
|
|
|
|
|
|
elif file:
|
|
|
|
|
f = spooled_to_bytes_io_if_needed(
|
|
|
|
|
cast(Union[BinaryIO, SpooledTemporaryFile], file),
|
|
|
|
|
)
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
sheets = pd.read_excel(f, sheet_name=None, header=header)
|
2023-07-26 15:10:14 -04:00
|
|
|
last_modification_date = get_last_modified_date_from_file(file)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
elements: List[Element] = []
|
|
|
|
|
page_number = 0
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
for sheet_name, sheet in sheets.items():
|
2023-05-16 15:40:40 -04:00
|
|
|
page_number += 1
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and is
therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text elements
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
if not find_subtable:
|
|
|
|
|
html_text = sheet.to_html(index=False, header=include_header, na_rep="")
|
|
|
|
|
text = soupparser_fromstring(html_text).text_content()
|
|
|
|
|
|
|
|
|
|
if include_metadata:
|
|
|
|
|
metadata = ElementMetadata(
|
|
|
|
|
text_as_html=html_text,
|
|
|
|
|
page_name=sheet_name,
|
|
|
|
|
page_number=page_number,
|
|
|
|
|
filename=metadata_filename or filename,
|
|
|
|
|
last_modified=metadata_last_modified or last_modification_date,
|
|
|
|
|
)
|
2023-10-05 15:26:47 -05:00
|
|
|
metadata.detection_origin = DETECTION_ORIGIN
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and
is therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the consecutive single-cell rows from the top and
bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
else:
|
|
|
|
|
metadata = ElementMetadata()
|
|
|
|
|
|
|
|
|
|
table = Table(text=text, metadata=metadata)
|
|
|
|
|
elements.append(table)
|
2023-05-16 15:40:40 -04:00
|
|
|
else:
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and
is therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the consecutive single-cell rows from the top and
bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
_connected_components = _get_connected_components(sheet)
|
|
|
|
|
for _connected_component, _min_max_coords in _connected_components:
|
|
|
|
|
min_x, min_y, max_x, max_y = _min_max_coords
|
|
|
|
|
|
|
|
|
|
subtable = sheet.iloc[min_x : max_x + 1, min_y : max_y + 1] # noqa: E203
|
|
|
|
|
single_non_empty_rows, single_non_empty_row_contents = _single_non_empty_rows(
|
|
|
|
|
subtable,
|
|
|
|
|
)
|
|
|
|
|
(
|
|
|
|
|
front_non_consecutive,
|
|
|
|
|
last_non_consecutive,
|
|
|
|
|
) = _find_first_and_last_non_consecutive_row(
|
|
|
|
|
single_non_empty_rows,
|
|
|
|
|
subtable.shape,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
metadata = _get_metadata(
|
|
|
|
|
include_metadata,
|
|
|
|
|
sheet_name,
|
|
|
|
|
page_number,
|
|
|
|
|
metadata_filename or filename,
|
|
|
|
|
metadata_last_modified or last_modification_date,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# NOTE(klaijan) - need to explicitly define the condition to avoid the case of 0
|
|
|
|
|
if front_non_consecutive is not None and last_non_consecutive is not None:
|
|
|
|
|
first_row = int(front_non_consecutive - max_x)
|
|
|
|
|
last_row = int(max_x - last_non_consecutive)
|
|
|
|
|
subtable = _get_sub_subtable(subtable, (first_row, last_row))
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and
is therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the consecutive single-cell rows from the top and
bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
if front_non_consecutive is not None:
|
|
|
|
|
for content in single_non_empty_row_contents[: front_non_consecutive + 1]:
|
|
|
|
|
element = _check_content_element_type(str(content))
|
|
|
|
|
element.metadata = metadata
|
|
|
|
|
elements.append(element)
|
|
|
|
|
|
|
|
|
|
if subtable is not None and len(subtable) == 1:
|
|
|
|
|
element = _check_content_element_type(str(subtable.iloc[0].values[0]))
|
|
|
|
|
elements.append(element)
|
|
|
|
|
|
|
|
|
|
elif subtable is not None:
|
|
|
|
|
# parse subtables as html
|
|
|
|
|
html_text = subtable.to_html(index=False, header=include_header, na_rep="")
|
|
|
|
|
text = soupparser_fromstring(html_text).text_content()
|
|
|
|
|
subtable = Table(text=text)
|
|
|
|
|
subtable.metadata = metadata
|
|
|
|
|
subtable.metadata.text_as_html = html_text
|
|
|
|
|
elements.append(subtable)
|
|
|
|
|
|
|
|
|
|
if front_non_consecutive is not None and last_non_consecutive is not None:
|
|
|
|
|
for content in single_non_empty_row_contents[
|
|
|
|
|
front_non_consecutive + 1 : # noqa: E203
|
|
|
|
|
]:
|
|
|
|
|
element = _check_content_element_type(str(content))
|
|
|
|
|
element.metadata = metadata
|
|
|
|
|
elements.append(element)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
2023-10-10 20:47:56 -05:00
|
|
|
elements = list(
|
|
|
|
|
apply_lang_metadata(
|
|
|
|
|
elements=elements,
|
|
|
|
|
languages=languages,
|
|
|
|
|
detect_language_per_element=detect_language_per_element,
|
|
|
|
|
),
|
|
|
|
|
)
|
2023-05-16 15:40:40 -04:00
|
|
|
return elements
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the Excel file *without* a header by default
- Leverages a connected-components search to find subtables within the
sheet. This search is based on depth-first search (DFS)
- It also handles the overlapping table or text cases
- A row with only a single cell of data is not considered a table, and
is therefore passed on to determine its element type as text
- In connected elements, it is possible to have a table title, header, or
footer. We count the consecutive single-cell rows from the top and
bottom to identify that text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_connected_components(
    sheet: pd.DataFrame,
    filter: bool = True,
):
    """
    Group the non-empty cells of an excel sheet into connected components.

    A grid graph with one node per cell of `sheet` is built, every node whose cell is
    NaN (per `sheet.isna()`) is removed, and the remaining nodes are grouped with
    `networkx.connected_components`.

    Args:
        sheet: an excel sheet read in DataFrame.
        filter (bool, optional): If True (default), the components are passed through
            `_filter_overlapping_tables` before being returned.

    Returns:
        A list of tuples, one per component, each containing:
            - A list of the node coordinates (tuples) belonging to the component.
            - A tuple `(min_x, min_y, max_x, max_y)` with the bounds reported by
              `_find_min_max_coord` for that component.
    """
    n_rows, n_cols = sheet.shape

    # Start from a full lattice covering the sheet, then drop every empty cell so that
    # only adjacent non-empty cells remain connected.
    grid: nx.Graph = nx.grid_2d_graph(n_rows, n_cols)

    # Both the coordinate array and the NaN mask are transposed so their leading two
    # axes line up; the last axis of `coords` holds the coordinate pair of each cell.
    coords = np.indices((n_rows, n_cols)).T
    is_empty = sheet.isna().T
    grid.remove_nodes_from([tuple(cell) for cell in coords[is_empty]])

    def _describe(cells):
        # Bundle a component's cells with its bounding coordinates.
        min_x, min_y, max_x, max_y = _find_min_max_coord(cells)
        return {
            "component": cells,
            "min_x": min_x,
            "min_y": min_y,
            "max_x": max_x,
            "max_y": max_y,
        }

    components = [_describe(list(nodes)) for nodes in nx.connected_components(grid)]

    if filter:
        components = _filter_overlapping_tables(components)

    results = []
    for entry in components:
        bounds = (entry["min_x"], entry["min_y"], entry["max_x"], entry["max_y"])
        results.append((entry["component"], bounds))
    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _filter_overlapping_tables(
|
2023-10-14 14:38:21 -05:00
|
|
|
connected_components: List[Dict[str, Any]],
|
|
|
|
|
) -> List[Dict[str, Any]]:
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
"""
|
|
|
|
|
Filter out overlapping connected components to return distinct components.
|
|
|
|
|
"""
|
|
|
|
|
sorted_components = sorted(connected_components, key=lambda x: x["min_x"])
|
|
|
|
|
merged_components: List[dict] = []
|
|
|
|
|
current_component = None
|
|
|
|
|
for component in sorted_components:
|
|
|
|
|
if current_component is None:
|
|
|
|
|
current_component = component
|
|
|
|
|
else:
|
|
|
|
|
# Check if component overlaps with the current_component
|
|
|
|
|
if component["min_x"] <= current_component["max_x"]:
|
|
|
|
|
# Merge the components and update min_x, max_x
|
|
|
|
|
current_component["component"].extend(component["component"])
|
|
|
|
|
current_component["min_x"] = min(current_component["min_x"], component["min_x"])
|
|
|
|
|
current_component["max_x"] = max(current_component["max_x"], component["max_x"])
|
|
|
|
|
current_component["min_y"] = min(current_component["min_y"], component["min_y"])
|
|
|
|
|
current_component["max_y"] = max(current_component["max_y"], component["max_y"])
|
|
|
|
|
else:
|
|
|
|
|
# No overlap, add the current_component to the merged list
|
|
|
|
|
merged_components.append(current_component)
|
|
|
|
|
# Update the current_component
|
|
|
|
|
current_component = component
|
|
|
|
|
# Append the last current_component to the merged list
|
|
|
|
|
if current_component is not None:
|
|
|
|
|
merged_components.append(current_component)
|
|
|
|
|
return merged_components
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_min_max_coord(
|
|
|
|
|
connected_component: List[Dict[Any, Any]],
|
|
|
|
|
) -> Tuple[Union[int, float], Union[int, float], Union[int, float], Union[int, float]]:
|
|
|
|
|
"""
|
|
|
|
|
Find the minimum and maximum coordinates (bounding box) of a connected component.
|
|
|
|
|
"""
|
|
|
|
|
min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf")
|
|
|
|
|
for _x, _y in connected_component:
|
|
|
|
|
if _x < min_x:
|
|
|
|
|
min_x = _x
|
|
|
|
|
if _y < min_y:
|
|
|
|
|
min_y = _y
|
|
|
|
|
if _x > max_x:
|
|
|
|
|
max_x = _x
|
|
|
|
|
if _y > max_y:
|
|
|
|
|
max_y = _y
|
|
|
|
|
return min_x, min_y, max_x, max_y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_sub_subtable(subtable: pd.DataFrame, first_and_last_row: Tuple[int, int]) -> pd.DataFrame:
|
|
|
|
|
"""
|
|
|
|
|
Extract a sub-subtable from a given subtable based on the first and last row range.
|
|
|
|
|
"""
|
|
|
|
|
# TODO(klaijan) - to further check for sub subtable, we could check whether
|
|
|
|
|
# two consecutive rows contains full row of cells.
|
|
|
|
|
# if yes, it might not be a header. We should check the length.
|
|
|
|
|
first_row, last_row = first_and_last_row
|
|
|
|
|
if last_row == first_row:
|
|
|
|
|
return None
|
|
|
|
|
return subtable.iloc[first_row : last_row + 1] # noqa: E203
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_first_and_last_non_consecutive_row(
|
|
|
|
|
row_indices: List[int],
|
|
|
|
|
table_shape: Tuple[int, int],
|
|
|
|
|
) -> Tuple[Optional[int], Optional[int]]:
|
|
|
|
|
"""
|
|
|
|
|
Find the indices of the first and last non-consecutive rows in a list of row indices.
|
|
|
|
|
"""
|
|
|
|
|
# If the table is a single column with one or more rows
|
|
|
|
|
table_rows, table_cols = table_shape
|
|
|
|
|
if len(row_indices) == 1 or (len(row_indices) == table_rows and table_cols == 1):
|
|
|
|
|
return row_indices[0], row_indices[0]
|
|
|
|
|
|
|
|
|
|
arr = np.array(row_indices)
|
|
|
|
|
front_non_consecutive = next(
|
|
|
|
|
(i for i, (x, y) in enumerate(zip(arr, arr[1:])) if x + 1 != y),
|
|
|
|
|
None,
|
|
|
|
|
)
|
|
|
|
|
reversed_arr = arr[::-1] # Reverse the array
|
|
|
|
|
last_non_consecutive = next(
|
|
|
|
|
(i for i, (x, y) in enumerate(zip(reversed_arr, reversed_arr[1:])) if x - 1 != y),
|
|
|
|
|
None,
|
|
|
|
|
)
|
|
|
|
|
return front_non_consecutive, last_non_consecutive
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _single_non_empty_rows(subtable) -> Tuple[List[int], List[str]]:
|
|
|
|
|
"""
|
|
|
|
|
Identify single non-empty rows in a subtable and extract their row indices and contents.
|
|
|
|
|
"""
|
|
|
|
|
single_non_empty_rows = []
|
|
|
|
|
single_non_empty_row_contents = []
|
|
|
|
|
for index, row in subtable.iterrows():
|
|
|
|
|
if row.count() == 1:
|
|
|
|
|
single_non_empty_rows.append(index)
|
|
|
|
|
single_non_empty_row_contents.append(row.dropna().iloc[0])
|
|
|
|
|
return single_non_empty_rows, single_non_empty_row_contents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _check_content_element_type(text: str) -> Element:
    """
    Wrap `text` in the element class selected by the first matching text check.

    Checks run in this order: bulleted text, possible numbered list, possible narrative
    text, possible title. Bulleted text is passed through `clean_bullets` before being
    wrapped in a `ListItem`; every other match wraps `text` unchanged. When no check
    matches, a plain `Text` element is returned.
    """
    if is_bulleted_text(text):
        return ListItem(text=clean_bullets(text))

    # Remaining checks all wrap the text as-is; order matters, first match wins.
    ordered_checks = (
        (is_possible_numbered_list, ListItem),
        (is_possible_narrative_text, NarrativeText),
        (is_possible_title, Title),
    )
    for matches, element_cls in ordered_checks:
        if matches(text):
            return element_cls(text=text)
    return Text(text=text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_metadata(
    include_metadata: bool = True,
    sheet_name: Optional[str] = None,
    page_number: Optional[int] = -1,
    filename: Optional[str] = None,
    last_modification_date: Union[str, None] = None,
) -> ElementMetadata:
    """Build element metadata; it is left empty unless `include_metadata` is set."""
    if not include_metadata:
        return ElementMetadata()
    # The sheet name and modification date are stored under the metadata's own field
    # names (`page_name`, `last_modified`).
    return ElementMetadata(
        page_name=sheet_name,
        page_number=page_number,
        filename=filename,
        last_modified=last_modification_date,
    )
|