2023-10-14 14:38:21 -05:00
|
|
|
import sys
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
import pytest
|
|
|
|
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
from test_unstructured.partition.test_constants import (
|
|
|
|
EXPECTED_TABLE_XLSX,
|
|
|
|
EXPECTED_TEXT_XLSX,
|
|
|
|
EXPECTED_TITLE,
|
|
|
|
)
|
2023-10-12 12:47:55 -07:00
|
|
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
2023-05-16 15:40:40 -04:00
|
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
from unstructured.documents.elements import Table, Text, Title
|
2023-05-16 15:40:40 -04:00
|
|
|
from unstructured.partition.xlsx import partition_xlsx
|
|
|
|
|
|
|
|
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
|
|
|
2023-05-18 16:53:23 +03:00
|
|
|
EXCEPTED_PAGE_NAME = "Stanley Cups"
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, include_header=False)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
|
|
|
assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert elements[1].metadata.page_number == 1
|
|
|
|
assert elements[1].metadata.filetype == EXPECTED_FILETYPE
|
|
|
|
assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
|
|
|
|
assert elements[1].metadata.filename == "stanley-cups.xlsx"
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
2023-08-04 16:14:08 +02:00
|
|
|
def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, include_header=False)
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Text) for element in elements) == 1
|
2023-08-13 12:20:33 -07:00
|
|
|
assert len(elements) == 1
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
|
2023-08-04 16:14:08 +02:00
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_xlsx_from_filename_with_metadata_filename(
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False)
|
2023-07-05 15:02:22 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
2023-07-05 15:02:22 -05:00
|
|
|
assert elements[0].metadata.filename == "test"
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
2023-10-23 17:11:53 -07:00
|
|
|
@pytest.mark.parametrize(
|
|
|
|
"infer_table_structure",
|
|
|
|
[
|
|
|
|
True,
|
|
|
|
False,
|
|
|
|
],
|
|
|
|
)
|
|
|
|
def test_partition_xlsx_infer_table_structure(
|
|
|
|
infer_table_structure,
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
|
|
|
elements = partition_xlsx(filename=filename, infer_table_structure=infer_table_structure)
|
|
|
|
table_elements = [e for e in elements if isinstance(e, Table)]
|
|
|
|
for table_element in table_elements:
|
|
|
|
table_element_has_text_as_html_field = (
|
|
|
|
hasattr(table_element.metadata, "text_as_html")
|
|
|
|
and table_element.metadata.text_as_html is not None
|
|
|
|
)
|
|
|
|
assert table_element_has_text_as_html_field == infer_table_structure
|
|
|
|
|
|
|
|
|
2023-08-16 23:16:23 -05:00
|
|
|
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
elements = partition_xlsx(filename=filename, include_header=True)
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
2023-08-16 23:16:23 -05:00
|
|
|
assert len(elements) == 2
|
|
|
|
assert (
|
|
|
|
clean_extra_whitespace(elements[0].text)
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
|
2023-08-16 23:16:23 -05:00
|
|
|
)
|
|
|
|
assert "<thead>" in elements[0].metadata.text_as_html
|
|
|
|
|
|
|
|
|
2023-05-16 15:40:40 -04:00
|
|
|
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(file=f, include_header=False)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
|
|
|
assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert elements[1].metadata.page_number == 1
|
|
|
|
assert elements[1].metadata.filetype == EXPECTED_FILETYPE
|
|
|
|
assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME
|
|
|
|
assert elements[1].metadata.filename is None
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
|
2023-07-05 15:02:22 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert elements[1].metadata.filename == "test"
|
2023-05-16 15:40:40 -04:00
|
|
|
|
|
|
|
|
2023-08-16 23:16:23 -05:00
|
|
|
def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
elements = partition_xlsx(file=f, include_header=True)
|
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
2023-08-16 23:16:23 -05:00
|
|
|
assert len(elements) == 2
|
|
|
|
assert (
|
|
|
|
clean_extra_whitespace(elements[0].text)
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
|
2023-08-16 23:16:23 -05:00
|
|
|
)
|
|
|
|
assert "<thead>" in elements[0].metadata.text_as_html
|
|
|
|
|
|
|
|
|
2023-06-30 09:44:46 -05:00
|
|
|
def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
2023-05-16 15:40:40 -04:00
|
|
|
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert elements[1].metadata.text_as_html is None
|
|
|
|
assert elements[1].metadata.page_number is None
|
|
|
|
assert elements[1].metadata.filetype is None
|
|
|
|
assert elements[1].metadata.page_name is None
|
|
|
|
assert elements[1].metadata.filename is None
|
2023-06-30 09:44:46 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
with open(filename, "rb") as f:
|
2023-08-16 23:16:23 -05:00
|
|
|
elements = partition_xlsx(file=f, include_metadata=False, include_header=False)
|
2023-06-30 09:44:46 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert sum(isinstance(element, Title) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
2023-06-30 09:44:46 -05:00
|
|
|
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files are a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT_XLSX
|
2023-06-30 09:44:46 -05:00
|
|
|
assert elements[0].metadata.text_as_html is None
|
|
|
|
assert elements[0].metadata.page_number is None
|
|
|
|
assert elements[0].metadata.filetype is None
|
|
|
|
assert elements[0].metadata.page_name is None
|
2023-07-05 15:02:22 -05:00
|
|
|
assert elements[0].metadata.filename is None
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_metadata_date(
|
|
|
|
mocker,
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
|
|
|
mocked_last_modification_date = "2029-07-05T09:24:28"
|
|
|
|
|
|
|
|
mocker.patch(
|
|
|
|
"unstructured.partition.xlsx.get_last_modified_date",
|
|
|
|
return_value=mocked_last_modification_date,
|
|
|
|
)
|
|
|
|
|
|
|
|
elements = partition_xlsx(
|
|
|
|
filename=filename,
|
|
|
|
)
|
|
|
|
|
2023-07-31 19:55:43 -07:00
|
|
|
assert elements[0].metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_with_custom_metadata_date(
|
|
|
|
mocker,
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
|
|
|
mocked_last_modification_date = "2029-07-05T09:24:28"
|
|
|
|
expected_last_modification_date = "2020-07-05T09:24:28"
|
|
|
|
|
|
|
|
mocker.patch(
|
|
|
|
"unstructured.partition.xlsx.get_last_modified_date",
|
|
|
|
return_value=mocked_last_modification_date,
|
|
|
|
)
|
|
|
|
|
|
|
|
elements = partition_xlsx(
|
|
|
|
filename=filename,
|
2023-07-31 19:55:43 -07:00
|
|
|
metadata_last_modified=expected_last_modification_date,
|
2023-07-26 15:10:14 -04:00
|
|
|
)
|
|
|
|
|
2023-07-31 19:55:43 -07:00
|
|
|
assert elements[0].metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_metadata_date(
|
|
|
|
mocker,
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
|
|
|
mocked_last_modification_date = "2029-07-05T09:24:28"
|
|
|
|
|
|
|
|
mocker.patch(
|
|
|
|
"unstructured.partition.xlsx.get_last_modified_date_from_file",
|
|
|
|
return_value=mocked_last_modification_date,
|
|
|
|
)
|
|
|
|
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
elements = partition_xlsx(
|
|
|
|
file=f,
|
|
|
|
)
|
|
|
|
|
2023-07-31 19:55:43 -07:00
|
|
|
assert elements[0].metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_from_file_with_custom_metadata_date(
|
|
|
|
mocker,
|
|
|
|
filename="example-docs/stanley-cups.xlsx",
|
|
|
|
):
|
|
|
|
mocked_last_modification_date = "2029-07-05T09:24:28"
|
|
|
|
expected_last_modification_date = "2020-07-05T09:24:28"
|
|
|
|
|
|
|
|
mocker.patch(
|
|
|
|
"unstructured.partition.xlsx.get_last_modified_date_from_file",
|
|
|
|
return_value=mocked_last_modification_date,
|
|
|
|
)
|
|
|
|
|
|
|
|
with open(filename, "rb") as f:
|
2023-07-31 19:55:43 -07:00
|
|
|
elements = partition_xlsx(file=f, metadata_last_modified=expected_last_modification_date)
|
2023-07-26 15:10:14 -04:00
|
|
|
|
2023-07-31 19:55:43 -07:00
|
|
|
assert elements[0].metadata.last_modified == expected_last_modification_date
|
2023-08-29 16:59:26 -04:00
|
|
|
|
|
|
|
|
2023-10-12 12:47:55 -07:00
|
|
|
def test_partition_xlsx_with_json():
|
|
|
|
elements = partition_xlsx(example_doc_path("stanley-cups.xlsx"), include_header=False)
|
|
|
|
assert_round_trips_through_JSON(elements)
|
feat: xlsx subtable extraction (#1585)
**Executive Summary**
Unstructured is now able to capture subtables, along with other text
element types within the `.xlsx` sheet.
**Technical Details**
- The function now reads the excel *without* header as default
- Leverages the connected components search to find subtables within the
sheet. This search is based on dfs search
- It also handle the overlapping table or text cases
- Row with only single cell of data is considered not a table, and
therefore passed on the determine the element type as text
- In connected elements, it is possible to have table title, header, or
footer. We run the count for the first non-single empty rows from top
and bottom to determine those text
**Result**
This table now reads as:
<img width="747" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/2177850/6b8e6d01-4ca5-43f4-ae88-6104b0174ed2">
```
[
{
"type": "Title",
"element_id": "3315afd97f7f2ebcd450e7c939878429",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Financial performance"
},
{
"type": "Table",
"element_id": "17f5d512705be6f8812e5dbb801ba727",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "3315afd97f7f2ebcd450e7c939878429",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Quarterly revenue</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>1</td>\n </tr>\n <tr>\n <td>Group financial performance</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>2</td>\n </tr>\n <tr>\n <td>Segmental results</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>3</td>\n </tr>\n <tr>\n <td>Segmental analysis</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>4</td>\n </tr>\n <tr>\n <td>Cash flow</td>\n <td>FY 22</td>\n <td>FY 23</td>\n <td></td>\n <td>5</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n"
},
{
"type": "Title",
"element_id": "8a9db7161a02b427f8fda883656036e1",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Operational metrics"
},
{
"type": "Table",
"element_id": "d5d16f7bf9c7950cd45fae06e12e5847",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "8a9db7161a02b427f8fda883656036e1",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Mobile customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>6</td>\n </tr>\n <tr>\n <td>Fixed broadband customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>7</td>\n </tr>\n <tr>\n <td>Marketable homes passed</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>8</td>\n </tr>\n <tr>\n <td>TV customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>9</td>\n </tr>\n <tr>\n <td>Converged customers</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>10</td>\n </tr>\n <tr>\n <td>Mobile churn</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>11</td>\n </tr>\n <tr>\n <td>Mobile data usage</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>12</td>\n </tr>\n <tr>\n <td>Mobile ARPU</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n"
},
{
"type": "Title",
"element_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "Other"
},
{
"type": "Table",
"element_id": "080e1a745a2a3f2df22b6a08d33d59bb",
"metadata": {
"filename": "vodafone.xlsx",
"file_directory": "example-docs",
"last_modified": "2023-10-03T17:51:34",
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"parent_id": "f97e9da0e3b879f0a9df979ae260a5f7",
"languages": [
"spa",
"ita"
],
"page_number": 1,
"page_name": "Index",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Topic</td>\n <td>Period</td>\n <td></td>\n <td></td>\n <td>Page</td>\n </tr>\n <tr>\n <td>Average foreign exchange rates</td>\n <td>Nine quarters to 30 June 2023</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n <tr>\n <td>Guidance rates</td>\n <td>FY 23/24</td>\n <td></td>\n <td></td>\n <td>14</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n"
}
]
```
2023-10-04 13:30:23 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skip("Needs to fix language detection for table. Currently detected as 'tur'")
|
|
|
|
def test_partition_xlsx_metadata_language_from_filename(filename="example-docs/stanley-cups.xlsx"):
|
|
|
|
elements = partition_xlsx(filename=filename, include_header=False)
|
|
|
|
|
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 2
|
|
|
|
assert len(elements) == 4
|
|
|
|
|
|
|
|
assert elements[0].metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"):
|
|
|
|
elements = partition_xlsx(filename)
|
|
|
|
assert sum(isinstance(element, Table) for element in elements) == 3
|
|
|
|
assert len(elements) == 6
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_element_metadata_has_languages():
|
|
|
|
filename = "example-docs/stanley-cups.xlsx"
|
|
|
|
elements = partition_xlsx(filename=filename)
|
|
|
|
assert elements[0].metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_eml_respects_detect_language_per_element():
|
|
|
|
filename = "example-docs/language-docs/eng_spa.xlsx"
|
|
|
|
elements = partition_xlsx(filename=filename, detect_language_per_element=True)
|
|
|
|
langs = {element.metadata.languages[0] for element in elements}
|
|
|
|
assert "eng" in langs
|
|
|
|
assert "spa" in langs
|
2023-10-14 14:38:21 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_xlsx_with_more_than_1k_cells():
|
|
|
|
old_recursion_limit = sys.getrecursionlimit()
|
|
|
|
try:
|
|
|
|
sys.setrecursionlimit(1000)
|
|
|
|
filename = "example-docs/more-than-1k-cells.xlsx"
|
|
|
|
partition_xlsx(filename=filename)
|
|
|
|
finally:
|
|
|
|
sys.setrecursionlimit(old_recursion_limit)
|